1 (uiop:define-package
#:lw2.clean-html
2 (:use
#:cl
#:alexandria
#:iterate
#:split-sequence
#:lw2.lmdb
#:lw2.links
#:lw2.utils
#:lw2.context
#:lw2.sites
#:lw2.conditions
#:lw2.colors
)
3 (:export
#:*before-clean-hook
* #:*link-hook
* #:url-scanner
#:clean-text
#:clean-text-to-html
#:clean-html
#:clean-html
* #:extract-excerpt
#:extract-excerpt
*)
4 (:unintern
#:*text-clean-regexps
* #:*html-clean-regexps
*))
6 (in-package #:lw2.clean-html
)
8 (setf cl-typesetting-hyphen
::*hyphen-patterns-directory
* (asdf:system-relative-pathname
"lw2-viewer" "data/hyphenation-patterns/"))
9 (setf cl-typesetting-hyphen
::*language-hyphen-file-list
* '((:en-us .
"hyph_en_US")))
10 (cl-typesetting-hyphen:load-language
:en-us
)
11 (setf cl-typesetting
::*default-hyphen-language
* :en-us
)
13 (defvar *before-clean-hook
* nil
)
14 (defvar *link-hook
* nil
)
16 (defun file-get-contents (filename)
17 (with-open-file (stream filename
)
18 (uiop:slurp-stream-string stream
)))
20 (defun grab-from-rts (url)
21 (declare (optimize (speed 0) (space 3)))
22 (let* ((root (plump:parse
(dex:get url
:keep-alive nil
)))
23 (post-body (plump:get-element-by-id root
"wikitext")))
24 (loop for cls in
'("div.nav_menu" "div.imgonly" "div.bottom_nav") do
25 (loop for e across
(clss:select cls post-body
)
26 do
(plump:remove-child e
)))
27 (plump:remove-child
(elt (clss:select
"h1" post-body
) 0))
28 (plump:remove-child
(elt (clss:select
"p" post-body
) 0))
29 (with-open-file (stream (merge-pathnames "./rts-content/" (subseq (quri:uri-path
(quri:uri url
)) 1)) :direction
:output
:if-does-not-exist
:create
:external-format
:utf-8
)
30 (plump:serialize post-body stream
))))
32 (defun rts-to-html (file)
33 (declare (optimize (speed 0) (space 3)))
36 (file-get-contents "./rts-content/rts.css")
38 (file-get-contents (merge-pathnames "./rts-content/" file
))))
40 (defparameter *html-overrides
* (make-hash-table :test
'equal
))
41 (loop for
(id file
) in
'(("XTXWPQSEgoMkAupKt" "An-Intuitive-Explanation-Of-Bayess-Theorem")
42 ("afmj8TKAqH6F2QMfZ" "A-Technical-Explanation-Of-Technical-Explanation")
43 ("7ZqGiPHTpiDMwqMN2" "The-Twelve-Virtues-Of-Rationality"))
44 do
(let ((file* file
)) (setf (gethash id
*html-overrides
*) (lambda () (rts-to-html file
*)))))
46 (defmacro do-with-cleaners
((regexp-list scanner replacement
) &body body
)
47 (declare (optimize (speed 0) (space 3)))
48 `(labels ((fn (,scanner
,replacement
) ,@body
))
49 ,@(loop for
(regex flags replacement
) in
(eval regexp-list
)
50 collecting
`(fn (load-time-value
51 (ppcre:create-scanner
,regex
52 ,@(loop for
(flag sym
) in
'((#\i
:case-insensitive-mode
)
53 (#\m
:multi-line-mode
)
54 (#\s
:single-line-mode
)
56 when
(find flag flags
)
57 append
(list sym t
))))
60 (defmacro define-cleaner
(name regexp-list
)
61 (declare (optimize (speed 0) (space 3)))
63 (do-with-cleaners (,regexp-list scanner replacement
)
64 (setf text
(ppcre:regex-replace-all scanner text replacement
)))))
66 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
67 (defun read-regexp-file (filename)
68 (declare (optimize (speed 0) (space 3)))
69 (let ((data (destructuring-bind (* ((* (* inner
))))
70 (with-open-file (stream (uiop:subpathname
(asdf:system-source-directory
"lw2-viewer") filename
)) (parse-js:parse-js stream
))
72 (loop for input in data
73 collecting
(destructuring-bind (* ((* regex flags
) (* replacement
))) input
74 (list regex flags
(ppcre:regex-replace-all
"\\$(\\d)" replacement
"\\\\\\1")))))))
76 (define-cleaner clean-text
(read-regexp-file "text-clean-regexps.js"))
77 (define-cleaner clean-html-regexps
(read-regexp-file "html-clean-regexps.js"))
79 (declaim (ftype function url-scanner
))
80 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
81 (setf (fdefinition 'url-scanner
) (ppcre:create-scanner
82 "(?:https?://[-a-zA-Z0-9]+\\.[-a-zA-Z0-9.]+|[-a-zA-Z0-9.]+\\.(?:com|edu|gov|mil|net|org|int|biz|info|name|museum|us|ca|uk|io|ly))(?:\\:[0-9]+){0,1}(?:(?:/|\\?(?!(?:$|\\s)))(?:(?:(\\()|\\)(?![.,;:?!]?(?:$|\\s))|[-\\w\\d.,;:?'\\\\+@!&%$#=~–_/])*(?(1)[-\\w\\d\\\\+@&%$#=~_/)]|[-\\w\\d\\\\+@&%$#=~_/]))?)?"
83 :single-line-mode t
)))
85 (defun hyphenate-string (string)
86 (let ((hyphenation-list (cl-typesetting::hyphenate-string string
)))
87 (declare (type string string
)
88 (type list hyphenation-list
))
90 (let ((new-string (make-array (+ (length string
) (length hyphenation-list
)) :element-type
'character
:fill-pointer
0)))
91 (loop for char across string
92 for orig-offset of-type fixnum from
0
93 with current-hyphenation
= hyphenation-list
94 do
(when (and current-hyphenation
(= orig-offset
(the fixnum
(first current-hyphenation
))))
95 (vector-push #\SOFT_HYPHEN new-string
)
96 (setf current-hyphenation
(rest current-hyphenation
)))
97 do
(vector-push char new-string
))
98 (values (coerce new-string
'(simple-array character
1)) hyphenation-list
))
99 (values string nil
))))
101 (defun clean-text-to-html (text &key
(hyphenation t
))
103 (((or plump
:invalid-xml-character plump
:discouraged-xml-character
) #'abort
))
105 (plump:encode-entities
107 (funcall (if hyphenation
#'hyphenate-string
#'identity
) (clean-text text
))
110 (defun rot13-char (char)
111 (let ((char-code (char-code char
)))
112 (labels ((translate-char (base)
113 (code-char (+ base
(mod (+ 13 (- char-code base
)) 26)))))
114 (declare (dynamic-extent #'translate-char
))
116 ((<= (char-code #\A
) char-code
(char-code #\Z
))
117 (translate-char (char-code #\A
)))
118 ((<= (char-code #\a) char-code
(char-code #\z
))
119 (translate-char (char-code #\a)))
122 (declaim (inline letter-index
))
123 (defun letter-index (char)
124 (let ((char-code (char-code char
)))
126 ((<= (char-code #\A
) char-code
(char-code #\Z
))
127 (- char-code
(char-code #\A
)))
128 ((<= (char-code #\a) char-code
(char-code #\z
))
129 (- char-code
(char-code #\a))))))
131 (defparameter *letter-frequencies
*
132 '((#\A .
8.167) (#\B .
1.492) (#\C .
2.782) (#\D .
4.253) (#\E .
12.702)
133 (#\F .
2.228) (#\G .
2.015) (#\H .
6.094) (#\I .
6.966) (#\J .
0.153)
134 (#\K .
0.772) (#\L .
4.025) (#\M .
2.406) (#\N .
6.749) (#\O .
7.507)
135 (#\P .
1.929) (#\Q .
0.095) (#\R .
5.987) (#\S .
6.327) (#\T .
9.056)
136 (#\U .
2.758) (#\V .
0.978) (#\W .
2.36) (#\X .
0.15) (#\Y .
1.974)
139 (defparameter *letter-rot13-log-odds
*
140 (let ((array (make-array 26 :element-type
'single-float
)))
142 for letter-freq in
*letter-frequencies
*
143 for rot13-freq in
(concatenate 'list
(subseq *letter-frequencies
* 13 26) (subseq *letter-frequencies
* 0 13))
145 do
(setf (aref array i
) (log (/ (cdr letter-freq
) (cdr rot13-freq
)) 2)))
148 (defparameter *bigram-rot13-log-odds
*
149 (let ((bigram-data (with-open-file (stream (asdf:system-relative-pathname
"lw2-viewer" "data/bigrams.lisp"))
151 (bigram-table (make-array '(26 26) :element-type
'single-float
:initial-element
1.0))
152 (bigram-log-odds (make-array '(26 26) :element-type
'single-float
)))
154 for
(bigram count
) in bigram-data
155 do
(setf (aref bigram-table
156 (letter-index (aref bigram
0))
157 (letter-index (aref bigram
1)))
158 (float (+ count
1))))
161 (setf (aref bigram-log-odds i j
)
162 (log (/ (aref bigram-table i j
)
169 (defun rot13-text-p (text &optional
(start 0) (end (length text
)))
170 (declare (type simple-string text
))
176 (let ((odds-table (load-time-value *letter-rot13-log-odds
*)))
177 (declare (type (simple-array single-float
(26)) odds-table
))
178 (if-let ((letter-index (letter-index (aref text start
))))
179 (aref odds-table letter-index
)
182 (let ((odds-table (load-time-value *bigram-rot13-log-odds
*)))
183 (declare (type (simple-array single-float
(26 26)) odds-table
))
186 for i from
(1+ start
) to
(1- end
)
187 do
(when-let ((a (letter-index (aref text
(1- i
))))
188 (b (letter-index (aref text i
))))
189 (setf sum
(+ sum
(aref odds-table a b
))))
190 finally
(return sum
)))))))
192 (< log-odds
(log 0.0001 2))
195 (defun rot13-inplace (text &optional
(start 0) (end (1- (length text
))))
196 (declare (type simple-string text
))
198 for i from start to end
199 do
(setf (aref text i
) (rot13-char (aref text i
))))
202 (defun unrot13-by-words (text)
203 (declare (type simple-string text
))
208 when
(or (= i
(1- (length text
)))
209 (position char
",.:;?! "))
211 (when (< (nth-value 1 (rot13-text-p text word-start i
)) (+ 0 (- i word-start
0)))
212 (rot13-inplace text word-start i
))
213 (setf word-start i
)))
216 (declaim (ftype (function (plump:node
) list
) class-list
))
218 (defun class-list (node)
219 (let ((class (plump:attribute node
"class")))
220 (and class
(split-sequence #\Space class
))))
222 (declaim (ftype (function (plump:node
&rest simple-string
) boolean
) tag-is class-is-not text-class-is-not
))
224 (defun tag-is (node &rest args
)
225 (declare (type plump
:node node
)
226 (dynamic-extent args
))
227 (when (plump:element-p node
)
228 (let ((tag (plump:tag-name node
)))
230 (some (lambda (x) (string= tag x
))
233 (defun every-ancestor (node test
)
234 (declare (type plump
:node node
)
235 (type function test
))
236 (iter (for target first node then
(plump:parent target
))
237 (cond ((or (plump:root-p target
) (null target
)) (return t
))
238 ((not (funcall test target
)) (return nil
)))))
240 (defun class-is-not (node &rest args
)
241 (declare (type plump
:node node
)
242 (dynamic-extent args
))
243 (every-ancestor node
(lambda (n)
249 (defun text-class-is-not (node &rest args
)
250 (declare (type plump
:node node
)
251 (dynamic-extent args
))
252 (apply #'class-is-not
(plump:parent node
) args
))
254 (defun clean-dom-text (root)
256 (((or plump
:invalid-xml-character plump
:discouraged-xml-character
) #'abort
))
259 (and (plump:element-p node
)
260 (ppcre:scan
"^(?:p|div|blockquote|li|h[0-6])$" (plump:tag-name node
))))
262 (and (plump:text-node-p node
)
264 (text-class-is-not node
"mjx-math" "arbital-math")
265 (not (tag-is (plump:parent node
) "code"))))
266 (traverse (node main-fn
&optional recurse-fn
)
267 (when (cleanablep node
) (funcall main-fn node
))
268 (when (plump:nesting-node-p node
)
269 (loop for n across
(plump:children node
)
271 (if recurse-fn
(funcall recurse-fn n
))
272 (traverse n main-fn recurse-fn
))))))
273 (let* ((offset-list nil
)
275 (with-output-to-string (stream)
279 (push (length (the string
(plump:text node
))) offset-list
)
280 (write-string (plump:text node
) stream
))
282 (whole-string-output whole-string-input
))
283 (declare (type string whole-string-output whole-string-input
))
284 (setf offset-list
(nreverse offset-list
))
286 ((call-with-offset-loop (continue-fn loop-fn next-offset-fn offset-adjust-fn
)
287 (loop with current-offset
= offset-list
288 with output-offset of-type
(or null fixnum
) = (first current-offset
)
289 with output-offset-list
= nil
290 with total-offset of-type fixnum
= 0
291 while
(funcall continue-fn
)
293 do
(loop for current-offset-num of-type fixnum
= (first current-offset
)
294 while
(and (rest current-offset
) (< (+ total-offset current-offset-num
) (funcall next-offset-fn
)))
296 (push output-offset output-offset-list
)
297 (setf total-offset
(+ total-offset current-offset-num
)
298 current-offset
(cdr current-offset
)
299 output-offset
(first current-offset
))))
300 do
(setf output-offset
(funcall offset-adjust-fn output-offset
))
302 (push output-offset output-offset-list
)
303 (loop for x in
(rest current-offset
) do
(push x output-offset-list
))
304 (setf offset-list
(nreverse output-offset-list
))))
306 (declare (dynamic-extent (function call-with-offset-loop
))
307 (ftype (function ((function ()) (function ()) (function () fixnum
) (function (fixnum) fixnum
)) (values)) call-with-offset-loop
))
309 ((offset-loop ((list-binding list-form
) (&body loop-body
) (&body next-offset-body
) (&body offset-adjust-body
))
310 (with-gensyms (list-current)
311 `(let ((,list-current
,list-form
)
313 (labels ((continue-fn () (if ,list-current
(setf ,list-binding
(pop ,list-current
))))
314 (loop-fn () ,.loop-body
)
315 (next-offset-fn () ,.next-offset-body
)
316 (offset-adjust-fn ,(first offset-adjust-body
) (declare (type fixnum
,(caar offset-adjust-body
)) (values fixnum
)) ,.
(rest offset-adjust-body
)))
317 (declare (dynamic-extent (function continue-fn
) (function loop-fn
) (function next-offset-fn
) (function offset-adjust-fn
)))
318 (call-with-offset-loop #'continue-fn
#'loop-fn
#'next-offset-fn
#'offset-adjust-fn
))))))
319 (do-with-cleaners ((read-regexp-file "text-clean-regexps.js") scanner replacement
)
320 (let ((replacements 0)
321 (replacement-list nil
)
322 (original-length (length whole-string-output
)))
323 (declare (type fixnum replacements
))
324 (ppcre:do-scans
(match-start match-end reg-starts reg-ends scanner whole-string-output
)
325 (declare (type fixnum match-start match-end
)
326 (type simple-vector reg-starts reg-ends
))
329 (list (if (and (> (length reg-starts
) 0) (eq (aref reg-starts
0) match-start
))
332 (if (and (> (length reg-starts
) 0) (eq (aref reg-ends
(- (length reg-ends
) 1)) match-end
))
333 (aref reg-starts
(- (length reg-starts
) 1))
336 (setf replacement-list
(nreverse replacement-list
))
337 (setf whole-string-output
(ppcre:regex-replace-all scanner whole-string-output replacement
))
338 (let ((length-difference (- (length whole-string-output
) original-length
))
340 (declare (type fixnum length-difference length-change
))
342 (current-replacement replacement-list
)
343 ((setf length-change
(ceiling length-difference replacements
)
344 length-difference
(- length-difference length-change
)
345 replacements
(- replacements
1)))
346 ((destructuring-bind (start end
) current-replacement
347 (declare (type fixnum start end
))
348 (ceiling (+ start end
) 2)))
349 ((output-offset) (max 0 (+ output-offset length-change
)))))))
350 (multiple-value-bind (hyphenated-string hyphenation-list
) (hyphenate-string whole-string-output
)
351 (setf whole-string-output hyphenated-string
)
353 (current-hyphenation hyphenation-list
)
355 (current-hyphenation)
356 ((output-offset) (1+ output-offset
))))))
357 (let ((current-offset 0))
358 (declare (type (or null fixnum
) current-offset
))
362 (let ((output-length (length whole-string-output
))
363 (next-offset (if offset-list
(+ current-offset
(the fixnum
(first offset-list
))) nil
)))
364 (declare (type (or null fixnum
) next-offset
))
365 (setf (plump:text node
) (subseq whole-string-output
(min current-offset output-length
) (and next-offset
(min next-offset output-length
)))
366 current-offset next-offset
367 offset-list
(cdr offset-list
))))
368 (lambda (node) (declare (ignore node
))))))))
371 ;;;; Dynamic content blocks.
373 (define-cache-database 'lw2.backend-modules
:backend-lmdb-cache
"dynamic-content-blocks")
375 (defparameter *dynamic-content-block-callback
* nil
)
377 (defclass dynamic-element
(plump:element
)
378 ((dynamic-call-form :initarg
:dynamic-call-form
:accessor dynamic-call-form
)))
380 (defmethod plump:serialize-object
:around
((node dynamic-element
))
381 (funcall *dynamic-content-block-callback
* :start node
)
383 (funcall *dynamic-content-block-callback
* :end node
))
385 (defun create-dynamic-call (node function-name
&rest args
)
386 (change-class node
'dynamic-element
:dynamic-call-form
(list* function-name args
)))
388 (defun call-with-dynamic-block-serialization (fn hash output-string
)
389 (let* ((current-start nil
)
392 (flet ((current-octet-pos ()
393 (psetf last-string-pos
(length output-string
)
394 last-octet-pos
(+ last-octet-pos
(babel:string-size-in-octets output-string
:start last-string-pos
)))
396 (declare (dynamic-extent #'current-octet-pos
))
397 (let* ((dynamic-call-list nil
)
398 (*dynamic-content-block-callback
*
401 (:start
(setf current-start
(current-octet-pos)))
402 (:end
(let ((current-end (current-octet-pos)))
403 (push (list* current-start current-end
(dynamic-call-form node
))
404 dynamic-call-list
)))))))
405 (declare (dynamic-extent *dynamic-content-block-callback
*))
406 (multiple-value-prog1
408 (when dynamic-call-list
409 (cache-put "dynamic-content-blocks" hash
(nreverse dynamic-call-list
) :key-type
:byte-vector
:value-type
:lisp
)))))))
411 (defmacro with-dynamic-block-serialization
((hash output-string
) &body body
)
412 `(dynamic-flet ((fn () ,@body
)) (call-with-dynamic-block-serialization #'fn
,hash
,output-string
)))
416 (define-lmdb-memoized extract-excerpt
'lw2.backend-modules
:backend-lmdb-cache
417 (:sources
("src/clean-html.lisp")) (in-html)
418 (let ((root (plump:parse
(string-trim '(#\Space
#\Newline
#\Tab
#\Return
#\Linefeed
#\Page
) in-html
)))
421 (with-output-to-string (out-stream)
426 (when (or (> (length (plump:children node
)) 1)
427 (plump:text-node-p
(plump:first-child node
)))
428 (let ((text (plump:text node
)))
430 (write-char #\Space out-stream
))
431 (write-string text out-stream
)
432 (setf chars
(+ chars
(length text
))
436 :test
(lambda (node) (tag-is node
"p")))))))
438 (define-lmdb-memoized clean-html
'lw2.backend-modules
:backend-lmdb-cache
439 (:sources
("src/clean-html.lisp" "src/links.lisp" "src/colors.lisp" "text-clean-regexps.js" "html-clean-regexps.js")) (in-html &key with-toc post-id
)
440 (declare (ftype (function (plump:node
) fixnum
) plump
:child-position
)
441 (ftype (function (plump:node
) (and vector
(not simple-array
))) plump
:family
)
442 (ftype (function (plump:node
) simple-string
) plump
:text plump
:tag-name
))
443 (labels ((only-child-is (node &rest args
)
444 (declare (dynamic-extent args
))
445 (and (= 1 (length (plump:children node
)))
446 (let ((child (plump:first-child node
)))
448 (plump:element-p child
)
449 (apply #'tag-is child args
)))))
450 (is-child-of-tag (node &rest args
)
451 (declare (dynamic-extent args
))
452 (loop for e
= (plump:parent node
) then
(plump:parent e
)
453 while
(not (typep e
'plump
:root
))
454 when
(and (plump:element-p e
) (apply #'tag-is
(cons e args
))) return t
))
455 (add-class (node class
)
456 (declare (type plump
:node node
)
458 (let ((classes (adjoin class
(alexandria:if-let
(attr (plump:attribute node
"class")) (split-sequence #\Space attr
)) :test
#'string
=)))
459 (declare (dynamic-extent classes
))
460 (setf (plump:attribute node
"class") (format nil
"~{~A~^ ~}" classes
)))
462 (remove-attributes (node &rest attrs
)
463 (declare (dynamic-extent attrs
))
465 (plump:remove-attribute node attr
)))
466 (make-element-before (node tag
)
467 (if (plump:text-node-p node
)
468 (make-element-before (plump:parent node
) tag
)
469 (let ((e (plump:make-element
(plump:parent node
) tag
)))
470 (plump:remove-child e
)
471 (plump:insert-before node e
)
472 (setf (plump:parent e
) (plump:parent node
))
474 (wrap-element (node element-name
)
475 (let ((container (make-element-before node element-name
)))
476 (plump:remove-child node
)
477 (plump:append-child container node
)
479 (wrap-children (node element-name
)
480 (let ((new-element (plump:make-element node element-name
)))
481 (plump:remove-child new-element
)
482 (setf (plump:children new-element
) (plump:clone-children node t new-element
)
483 (plump:children node
) (plump:make-child-array
))
484 (plump:append-child node new-element
)))
485 (move-children-out-of-node (node &key keep
)
486 (iterate (for c in-vector
(plump:children node
) downto
0)
487 (setf (plump:parent c
) (plump:parent node
))
488 (plump:insert-after node c
))
490 (setf (plump:children node
) (plump:make-child-array
))
491 (plump:remove-child node
)))
492 (text-node-is-not (node &rest args
)
493 (declare (type plump
:node node
)
494 (dynamic-extent args
))
496 (typep (plump:parent node
) 'plump
:root
)
497 (every (lambda (x) (string/= (plump:tag-name
(plump:parent node
)) x
)) args
)))
498 (adjacent-text-node (node direction
)
499 (multiple-value-bind (get-sibling insert-sibling
)
501 (:previous
(values #'plump
:previous-sibling
#'plump
:insert-before
))
502 (:next
(values #'plump
:next-sibling
#'plump
:insert-after
)))
503 (let ((candidate (funcall get-sibling node
)))
504 (if (plump:text-node-p candidate
)
506 (let ((new-node (plump:make-text-node
(plump:parent node
))))
507 (funcall insert-sibling node new-node
)
510 (let ((text (plump:text node
))
513 (loop for c across text
514 when
(alpha-char-p c
)
516 when
(digit-char-p c
)
518 finally
(return (>= digit alpha
)))))
519 (char-is-whitespace (c)
520 (or (cl-unicode:has-binary-property c
"White_Space")
521 (eql c
#\BRAILLE_PATTERN_BLANK
)))
522 (string-is-whitespace (string)
523 (every #'char-is-whitespace string
))
524 (remove-if-whitespace (node)
525 (when (string-is-whitespace (plump:text node
))
526 (plump:remove-child node
)))
527 (first-non-whitespace-child (node)
528 (loop for e across
(plump:children node
)
529 when
(or (typep e
'plump
:element
) (not (string-is-whitespace (plump:text e
)))) return e
))
530 (find-text-node (node direction
)
531 (let ((iterator #.
`(case direction
532 ,@(loop for d in
'(:first
:last
) collect
534 (iterate (for c in-vector
(plump:children node
) ,@(if (eq d
:last
) '(downto 0)))
535 (funcall fn c
))))))))
536 (declare (dynamic-extent iterator
))
538 ((and (plump:text-node-p node
) (plump:parent node
))
540 ((plump:nesting-node-p node
)
542 (funcall iterator
(lambda (c)
543 (when-let (tn (find-text-node c direction
)) (return tn
)))))))))
544 (vacuum-whitespace (node)
545 (dolist (direction '(:first
:last
))
546 (let ((displaced-text (make-string 0)))
547 (loop for lt
= (find-text-node node direction
) do
550 (return-from vacuum-whitespace node
))
551 ((string-is-whitespace (plump:text lt
))
552 (setf displaced-text
(case direction
553 (:first
(concatenate 'string displaced-text
(plump:text lt
)))
554 (:last
(concatenate 'string
(plump:text lt
) displaced-text
))))
555 (plump:remove-child lt
))
557 (let* ((text (plump:text lt
)))
560 (let ((boundary (loop for i from
0 to
(- (length text
) 1)
561 unless
(char-is-whitespace (aref text i
))
563 (setf displaced-text
(concatenate 'string displaced-text
(subseq text
0 boundary
))
564 (plump:text lt
) (subseq text boundary
))))
566 (let ((boundary (loop for i from
(- (length text
) 1) downto
0
567 unless
(char-is-whitespace (aref text i
))
569 (setf displaced-text
(concatenate 'string
(subseq text
(+ 1 boundary
) (length text
)))
570 (plump:text lt
) (subseq text
0 (+ 1 boundary
)))))))
572 (when (> (length displaced-text
) 0)
573 (let ((atn (adjacent-text-node node
(case direction
(:first
:previous
) (:last
:next
)))))
574 (setf (plump:text atn
) (case direction
575 (:first
(concatenate 'string
(plump:text atn
) displaced-text
))
576 (:last
(concatenate 'string displaced-text
(plump:text atn
))))))))))
577 (add-element-style (node attribute value
)
578 (let ((old-style (plump:attribute node
"style")))
579 (setf (plump:attribute node
"style")
581 (format nil
"~A~:[;~;~] ~A: ~A;" old-style
(ppcre:scan
";\s*$" old-style
) attribute value
)
582 (format nil
"~A: ~A;" attribute value
)))))
583 (style-string-to-alist (string)
584 (let ((rules (ppcre:split
"\\s*;\\s*" string
:sharedp t
)))
585 (iter (for rule in rules
)
586 (let ((parts (ppcre:split
"\\s*:\\s*" rule
:sharedp t
)))
587 (when (= 2 (length parts
))
588 (collect (cons (first parts
) (second parts
))))))))
589 (alist-to-style-string (alist)
590 (with-output-to-string (s)
591 (iter (for item in alist
)
592 (format s
"~A:~A;" (car item
) (cdr item
)))))
593 (remove-style-rules (node &rest rules
)
594 (declare (dynamic-extent rules
))
595 (when-let ((old-style (plump:attribute node
"style")))
596 (setf (plump:attribute node
"style")
597 (alist-to-style-string
598 (remove-if (lambda (x) (member (car x
) rules
:test
#'string-equal
))
599 (style-string-to-alist old-style
))))))
600 (flatten-element (node)
601 (let* ((previous-sibling (plump:previous-sibling node
))
602 (next-sibling (if (plump:text-node-p
(plump:next-sibling node
))
603 (plump:next-sibling node
)))
604 (new-text-node (if (plump:text-node-p previous-sibling
)
606 (plump:insert-before node
607 (plump:remove-child
(plump:make-text-node
(plump:parent node
)))))))
608 (setf (plump:parent new-text-node
) (plump:parent node
)
609 (plump:text new-text-node
) (concatenate 'string
610 (plump:text new-text-node
)
613 (plump:text next-sibling
)
615 (plump:remove-child node
)
616 (when next-sibling
(plump:remove-child next-sibling
))))
617 (scan-for-urls (text-node)
618 (declare (type plump
:text-node text-node
))
619 (let ((text (plump:text text-node
)))
620 (multiple-value-bind (url-start url-end
)
621 (ppcre:scan
#'url-scanner text
)
622 (declare (type simple-string text
)
623 (type (or null fixnum
) url-start url-end
))
625 (let* ((url-raw (subseq text url-start url-end
))
626 (url (if (mismatch "http" url-raw
:end2
4) (concatenate 'string
"http://" url-raw
) url-raw
))
627 (family (plump:family text-node
))
628 (other-children (prog1
629 (subseq family
(1+ (plump:child-position text-node
)))
630 (setf (fill-pointer family
) (1+ (plump:child-position text-node
)))))
631 (new-a (plump:make-element
(plump:parent text-node
) "a"))
632 (new-text (unless (= url-end
(length text
)) (plump:make-text-node
(plump:parent text-node
) (subseq text url-end
)))))
633 (setf (plump:text text-node
) (subseq text
0 url-start
)
634 (plump:attribute new-a
"href") (with-direct-link (presentable-link url
))
635 (plump:attribute new-a
"class") "bare-url")
636 (plump:make-text-node new-a
(clean-text url-raw
))
638 (scan-for-urls new-text
)
639 (setf (plump:text new-text
) (clean-text (plump:text new-text
))))
640 (loop for item across other-children
641 do
(plump:append-child
(plump:parent text-node
) item
))
642 (when (= url-start
0)
643 (plump:remove-child text-node
)))))))
644 (title-to-anchor (text used-anchors
)
645 ;; This should match LW behavior in packages/lesswrong/lib/collections/posts/tableOfContents.js
646 (let* ((chars-to-use "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789")
647 (base-anchor (with-output-to-string (stream)
648 (loop for c across text
649 do
(write-char (if (find c chars-to-use
) c
#\_
) stream
)))))
650 (loop for suffix from
0
651 for anchor
= base-anchor then
(format nil
"~A~A" base-anchor suffix
)
652 when
(not (gethash anchor used-anchors
))
653 return
(progn (setf (gethash anchor used-anchors
) t
)
655 (contents-to-html (contents min-header-level out-stream
)
656 (declare (type cons contents
))
657 (format out-stream
"<nav class=\"contents\"><div class=\"contents-head\">Contents</div><ul class=\"contents-list\">")
658 (loop for
(elem-level text id
) in contents do
659 (format out-stream
"<li class=\"toc-item-~A\"><a href=\"#~A\">~A</a></li>"
660 (- elem-level
(- min-header-level
1)) id
(clean-text-to-html text
)))
661 (format out-stream
"</ul></nav>"))
662 (style-hash-to-html (style-hash out-stream
)
663 (declare (type hash-table style-hash
))
664 (let ((style-list (alexandria:hash-table-keys style-hash
)))
666 (format out-stream
"<style>~{~A~}</style>" style-list
)))))
667 (declare (ftype (function (plump:node
&rest simple-string
) boolean
) only-child-is is-child-of-tag text-node-is-not
))
669 (((or plump
:invalid-xml-character plump
:discouraged-xml-character
) #'abort
))
671 (override (gethash post-id
*html-overrides
*))
673 (let ((root (plump:parse
(string-trim '(#\Space
#\Newline
#\Tab
#\Return
#\Linefeed
#\Page
) in-html
)))
677 (aggressive-deformat nil
)
678 (style-hash (make-hash-table :test
'equal
))
679 (used-colors (make-hash-table :test
'equal
))
680 (used-anchors (make-hash-table :test
'equal
)))
681 (declare (type fixnum section-count min-header-level
))
682 (when *before-clean-hook
*
683 (funcall *before-clean-hook
*))
684 (let ((wayward-li-container nil
))
689 ((not (plump:parent node
)) nil
)
692 ((not (plump:attribute node
"href"))
693 (move-children-out-of-node node
:keep t
))
695 (and (ppcre:scan
"^\s*https?://" (plump:text node
))
696 (not (find #\HORIZONTAL_ELLIPSIS
(plump:text node
))))
697 (notany (lambda (attr) (nonempty-string (plump:attribute node attr
))) '("href" "name" "id")))
698 (flatten-element node
))
700 (let* ((next-sibling (plump:next-sibling node
))
701 (next-text-node (if (plump:text-node-p next-sibling
) next-sibling
))
702 (next-next-sibling (if next-text-node
(plump:next-sibling next-text-node
) next-sibling
))
703 (next-a (if (and next-next-sibling
(tag-is next-next-sibling
"a")) next-next-sibling
)))
705 (or (not next-text-node
) (string-is-whitespace (plump:text next-text-node
)))
706 (string= (plump:attribute node
"href") (plump:attribute next-a
"href")))
708 (plump:remove-child next-text-node
)
709 (plump:append-child node next-text-node
))
710 (loop for c across
(plump:children next-a
)
711 do
(progn (plump:remove-child c
)
712 (plump:append-child node c
)))
713 (plump:remove-child next-a
)
715 ((tag-is node
"ul" "ol")
716 (setf wayward-li-container node
)
717 (let ((new-children (plump:make-child-array
)))
718 (loop for child across
(plump:children node
)
719 do
(if (and (plump:element-p child
) (tag-is child
"li"))
720 (vector-push-extend child new-children
)
721 (unless (and (plump:text-node-p child
) (string-is-whitespace (plump:text child
)))
722 (if (= (length new-children
) 0)
723 (vector-push-extend (plump:make-element node
"li") new-children
))
724 (plump:append-child
(aref new-children
(- (length new-children
) 1)) child
))))
725 (setf (plump:children node
) new-children
)))
727 (unless (is-child-of-tag node
"ul" "ol")
728 (unless wayward-li-container
729 (setf wayward-li-container
(make-element-before node
"ul")))
730 (plump:remove-child node
)
731 (plump:append-child wayward-li-container node
)))
732 ((tag-is node
"p" "blockquote" "div")
733 (setf wayward-li-container nil
))))
734 :test
#'plump
:element-p
))
735 (loop while
(and (= 1 (length (plump:children root
))) (plump:element-p
(plump:first-child root
)) (tag-is (plump:first-child root
) "div" "html" "body"))
736 do
(setf (plump:children root
) (plump:children
(plump:first-child root
)))
737 do
(loop for c across
(plump:children root
) do
(setf (plump:parent c
) root
))
738 do
(when-let (fc (plump:first-child root
))
739 (when (and (plump:element-p fc
) (tag-is fc
"head"))
740 (loop for c across
(plump:children fc
) do
741 (when (and (plump:element-p c
) (tag-is c
"style"))
742 (setf (plump:parent c
) (plump:parent fc
))
743 (plump:insert-after fc c
)))
744 (plump:remove-child fc
))))
745 (loop for c across
(plump:children root
) do
746 (when (and (plump:element-p c
)
748 (string-is-whitespace (plump:text c
)))
749 (move-children-out-of-node c
)))
750 (loop for lc
= (plump:last-child root
)
751 while
(and (plump:element-p lc
) (tag-is lc
"br"))
752 do
(plump:remove-child lc
))
756 (when (and (plump:text-node-p node
)
758 (text-node-is-not node
"a" "style" "pre"))
759 (scan-for-urls node
))))
763 (when (and (not (plump:root-p node
)) (plump:parent node
))
766 (when (and (text-node-is-not node
"style" "pre" "code")
767 (text-class-is-not node
"mjx-math"))
768 (let ((new-root (plump:parse
(clean-html-regexps (plump:serialize node nil
))))
769 (other-children (prog1
770 (subseq (plump:family node
) (1+ (plump:child-position node
)))
771 (setf (fill-pointer (plump:family node
)) (plump:child-position node
)))))
772 (loop for item across
(plump:children new-root
)
773 do
(plump:append-child
(plump:parent node
) item
))
774 (loop for item across other-children
775 do
(plump:append-child
(plump:parent node
) item
)))))
777 (alexandria:when-let
(style (plump:attribute node
"style"))
778 (let ((style-list (style-string-to-alist style
)))
779 (cond ((or aggressive-deformat
780 (cdr (assoc "font-family" style-list
:test
#'string-equal
))
781 (search "font-style: inherit" style
)
782 (search "MsoNormal" (plump:attribute node
"class")))
783 (setf aggressive-deformat t
)
784 (plump:remove-attribute node
"style"))
785 ((ppcre:scan
"(?:^|;)\\s*(?:line-height:[^;]+in)\\s*(?:;|$)" style
)
786 (plump:remove-attribute node
"style"))
789 (iter (for style-item in style-list
)
790 (when (member (car style-item
) '("color" "background-color") :test
#'string-equal
)
791 (multiple-value-bind (r g b a
) (decode-css-color (cdr style-item
))
793 (let ((color-name (safe-color-name r g b a
)))
795 (gethash color-name used-colors
) (list r g b a
)
796 (cdr style-item
) (format nil
"var(--user-color-~A)" color-name
)))))))
798 (setf (plump:attribute node
"style") (alist-to-style-string style-list
))))))))
799 (when (and aggressive-deformat
(tag-is node
"div"))
800 (setf (plump:tag-name node
) "p"))
801 (when (let ((class (plump:attribute node
"class"))
802 (parent (plump:parent node
)))
804 (or (search "mjx-math" class
)
805 (search "mjpage" class
))
807 (class-is-not parent
"mjx-math" "mjpage"))))
808 (loop for current
= node then
(plump:parent current
)
809 for parent
= (plump:parent current
)
810 when
(loop for s across
(plump:family current
)
811 unless
(or (eq s current
)
812 (and (plump:text-node-p s
) (string-is-whitespace (plump:text s
))))
814 do
(progn (add-class current
"mathjax-inline-container")
816 when
(or (plump:root-p parent
)
817 (tag-is parent
"p" "blockquote" "div"))
818 do
(progn (add-class current
"mathjax-block-container")
822 (vacuum-whitespace node
)
823 (let ((href (plump:attribute node
"href")))
825 (let* ((href (string-trim '(#\Space
#\Newline
#\Tab
#\Return
#\Linefeed
#\Page
) href
))
826 (href (if (ppcre:scan
"^(?:(?:[a-z]+:)?//|/|#)" href
) href
(format nil
"http://~A" href
)))
827 (href (or (with-direct-link (presentable-link href
)) href
)))
829 (setf (plump:attribute node
"href") href
)
831 (log-and-ignore-errors
832 (funcall *link-hook
* href
))))))))
834 (let ((width (ignore-errors (parse-integer (plump:attribute node
"width"))))
835 (height (ignore-errors (parse-integer (plump:attribute node
"height")))))
836 (if (and width height
(<= width
1) (<= height
1))
837 ;; Remove probable tracking pixel.
838 (plump:remove-child node
)
840 (if (and (tag-is (plump:parent node
) "div" "p" "figure")
841 (only-child-is (plump:parent node
) "img"))
842 (plump:parent node
) ; Should already have imgonly class.
843 (let ((container (wrap-element node
"div")))
844 (add-class container
"imgonly")
846 (when-let ((src (plump:attribute node
"src")))
847 (setf src
(presentable-link src
:image
)
848 (plump:attribute node
"src") src
)
849 (create-dynamic-call container
'lw2.images
::dynamic-image
851 (plump:tag-name container
)
852 (alexandria:hash-table-alist
(plump:attributes container
))
853 (alexandria:hash-table-alist
(plump:attributes node
))))
854 (when (and width height
)
855 ;; Apply responsive image scaling CSS.
856 (setf (plump:attribute container
"style") (format nil
"--aspect-ratio: ~F; max-width: ~Dpx"
860 (remove-attributes node
"style" "class" "width" "height")
861 (setf (plump:attribute node
"loading") "lazy")))))
862 ((tag-is node
"figure")
863 (remove-attributes node
"style" "class" "width" "height"))
864 ((and (tag-is node
"p") (only-child-is node
"figure"))
865 (move-children-out-of-node node
))
866 ((tag-is node
"p" "blockquote" "div" "center")
867 (when (only-child-is node
"center")
868 (unless (string-is-whitespace (plump:text node
))
869 (add-element-style node
"text-align" "center"))
870 (move-children-out-of-node (plump:first-child node
)))
871 (when (tag-is node
"center")
872 (setf (plump:tag-name node
) "p")
873 (add-element-style node
"text-align" "center"))
874 (when-let ((question-id (plump:attribute node
"data-elicit-id")))
875 (when-let* ((question-title (lw2.backend
::get-elicit-question-title question-id
))
876 (links (plump:get-elements-by-tag-name node
"a"))
877 (text-node (plump:first-child
(first links
))))
878 (setf (plump:text text-node
) question-title
))
879 (create-dynamic-call (plump:parent
(plump:parent node
)) 'lw2.elicit-predictions
::render-elicit-block question-id
))
880 (if (string-is-whitespace (plump:text node
))
881 (if (or (plump:get-elements-by-tag-name node
"img")
882 (plump:get-elements-by-tag-name node
"iframe"))
883 (add-class node
"imgonly")
884 (plump:remove-child node
))
885 (if-let (parent (plump:parent node
))
886 (labels ((spoilerp (n)
887 (if-let (a (and (plump:element-p n
) (plump:attribute n
"class")))
888 (ppcre:scan
"(?:^| )spoiler\\S*(?: |$)" a
))))
889 (when (and nil
(tag-is node
"p") ;; FIXME: disabled until we can fix math and code false positives
890 (rot13-text-p (plump:text node
)))
891 (setf (plump:attribute node
"class") "spoiler")
893 (lambda (n) (unrot13-by-words (plump:text n
)))
894 :test
#'plump
:text-node-p
))
896 ((and (tag-is node
"p")
899 (plump:remove-attribute node
"class"))
900 ((and (tag-is node
"div")
902 (setf (plump:attribute node
"class") "spoiler"))
903 ((and (spoilerp node
)
905 (not (spoilerp parent
)))
906 (let ((previous-sibling (plump:previous-sibling node
)))
907 (if (and previous-sibling
(spoilerp previous-sibling
))
908 (progn (plump:remove-child node
)
909 (plump:append-child previous-sibling node
)
910 (plump:remove-attribute node
"class"))
911 (let ((new-container (plump:make-element parent
"div")))
912 (setf (plump:attribute new-container
"class") "spoiler")
913 (plump:remove-child new-container
)
914 (setf (plump:parent new-container
) (plump:parent node
))
915 (plump:insert-before node new-container
)
916 (loop for e
= node then ns
917 while
(and (plump:element-p e
) (spoilerp e
))
918 for ns
= (plump:next-sibling e
)
920 (plump:remove-attribute e
"class")
921 (plump:remove-child e
)
922 (plump:append-child new-container e
))))))))))))
923 ((tag-is node
"table" "tbody" "tr" "td")
924 (remove-style-rules node
"border-top" "border-bottom" "border-left" "border-right" "padding")
925 (when (and (tag-is node
"td") (is-numeric node
))
926 (add-class node
"numeric")))
928 (let ((parent (plump:parent node
)))
930 ((and (or (plump:root-p parent
) (and (plump:element-p parent
) (tag-is parent
"p" "blockquote" "div")))
931 (loop for c across
(plump:children node
) never
(and (plump:element-p c
) (tag-is c
"a"))))
932 (vacuum-whitespace node
))
934 (move-children-out-of-node node
)))))
936 (when-let (old-style (plump:attribute node
"style"))
937 (setf (plump:attribute node
"style")
938 (ppcre:regex-replace-all
939 (load-time-value (ppcre:create-scanner
"list-style-type\\s*:\\s*decimal\\s*;?" :single-line-mode t
:case-insensitive-mode t
))
942 (when-let (start-string (plump:attribute node
"start"))
943 (when-let (start (ignore-errors (parse-integer start-string
)))
944 (plump:remove-attribute node
"start")
945 (add-element-style node
"counter-reset" (format nil
"ol ~A" (- start
1))))))
947 (when (let ((c (plump:first-child node
))) (and c
(if (plump:text-node-p c
) (not (string-is-whitespace (plump:text c
))) (not (tag-is c
"p" "ul" "ol")))))
948 (wrap-children node
"p")))
950 (let ((nchildren (length (plump:children node
))))
951 (when (>= nchildren
1)
952 (remove-if-whitespace (plump:first-child node
))
953 (when (>= nchildren
2)
954 (remove-if-whitespace (plump:last-child node
))))))
955 ((ppcre:scan
"^h[1-6]$" (plump:tag-name node
))
956 (when (plump:get-elements-by-tag-name node
"p")
957 (move-children-out-of-node node
))
959 ((string-is-whitespace (plump:text node
))
960 (plump:remove-child node
))
962 (let ((fc (plump:first-child node
))
963 (lc (plump:last-child node
)))
964 (when (and (plump:element-p fc
) (tag-is fc
"br")) (plump:remove-child fc
))
965 (when (and (not (eql fc lc
)) (plump:element-p lc
) (tag-is lc
"br")) (plump:remove-child lc
)))
968 (unless (plump:attribute node
"id") (setf (plump:attribute node
"id") (format nil
"section-~A" section-count
)))
969 (let* ((header-level (parse-integer (subseq (plump:tag-name node
) 1)))
970 (header-text (with-output-to-string (stream)
975 (when (text-node-is-not n
"style" "script")
976 (write-string (plump:text n
) stream
))))))))
977 (anchor-old (or (plump:attribute node
"id") (format nil
"section-~A" section-count
)))
978 (anchor-new (title-to-anchor header-text used-anchors
))
979 (wrapper (wrap-children node
"span")))
980 (setf min-header-level
(min min-header-level header-level
)
981 (plump:attribute node
"id") anchor-new
982 (plump:attribute wrapper
"id") anchor-old
)
983 (push (list header-level
987 ((and (tag-is node
"span") (find "footnote-back-link" (class-list node
) :test
#'string-equal
))
988 (plump:traverse node
#'move-children-out-of-node
:test
(lambda (n) (tag-is n
"sup" "strong"))))
989 ((tag-is node
"style")
990 (let ((text (plump:text node
)))
991 (when (search ".mjx-math" text
)
992 (setf (gethash text style-hash
) t
)))
993 (plump:remove-child node
))
994 ((tag-is node
"script")
995 (plump:remove-child node
))))))))
996 (clean-dom-text root
)
997 (let ((with-toc (>= section-count
3))
998 (out-string (make-array 0 :element-type
'character
:fill-pointer
0 :adjustable t
)))
999 (with-output-to-string (out-stream out-string
)
1000 (style-hash-to-html style-hash out-stream
)
1001 (when (> (hash-table-count used-colors
) 0)
1002 (format out-stream
"<style>~%:root {~%")
1003 (maphash (lambda (name rgba-list
)
1004 (declare (ignore rgba-list
))
1005 (format out-stream
" --user-color-~A: #~A;~%" name name
))
1007 (flet ((write-inverted-colors (theme)
1008 (format out-stream
"body.theme-~A {~%" theme
)
1009 (maphash (lambda (name rgba-list
)
1010 (format out-stream
" --user-color-~A: ~A;~%" name
1011 (multiple-value-call #'encode-css-color
(apply #'perceptual-invert-rgba rgba-list
))))
1013 (format out-stream
"}~%")))
1014 (format out-stream
"}~%@media (prefers-color-scheme: dark) {~%")
1015 (write-inverted-colors "default")
1016 (format out-stream
"}~%")
1017 (write-inverted-colors "dark"))
1018 (format out-stream
"</style>"))
1019 (with-dynamic-block-serialization (current-memo-hash out-string
)
1020 (loop for c across
(plump:children root
)
1022 (not (or (string-is-whitespace (plump:text c
))
1023 (tag-is c
"figure"))))
1025 (contents-to-html (nreverse contents
) min-header-level out-stream
)
1026 (setf with-toc nil
))
1027 do
(plump:serialize c out-stream
)))