Add some defenses against untitled posts.
[lw2-viewer.git] / src / clean-html.lisp
blobe468ec4b0e15cedc4b61124476303b1dbb626e68
1 (uiop:define-package #:lw2.clean-html
2 (:use #:cl #:alexandria #:iterate #:split-sequence #:lw2.lmdb #:lw2.links #:lw2.utils #:lw2.context #:lw2.sites #:lw2.conditions #:lw2.colors)
3 (:export #:*before-clean-hook* #:*link-hook* #:url-scanner #:clean-text #:clean-text-to-html #:clean-html #:clean-html* #:extract-excerpt #:extract-excerpt*)
4 (:unintern #:*text-clean-regexps* #:*html-clean-regexps*))
6 (in-package #:lw2.clean-html)
8 (setf cl-typesetting-hyphen::*hyphen-patterns-directory* (asdf:system-relative-pathname "lw2-viewer" "data/hyphenation-patterns/"))
9 (setf cl-typesetting-hyphen::*language-hyphen-file-list* '((:en-us . "hyph_en_US")))
10 (cl-typesetting-hyphen:load-language :en-us)
11 (setf cl-typesetting::*default-hyphen-language* :en-us)
13 (defvar *before-clean-hook* nil)
14 (defvar *link-hook* nil)
16 (defun file-get-contents (filename)
17 (with-open-file (stream filename)
18 (uiop:slurp-stream-string stream)))
20 (defun grab-from-rts (url)
21 (declare (optimize (speed 0) (space 3)))
22 (let* ((root (plump:parse (dex:get url :keep-alive nil)))
23 (post-body (plump:get-element-by-id root "wikitext")))
24 (loop for cls in '("div.nav_menu" "div.imgonly" "div.bottom_nav") do
25 (loop for e across (clss:select cls post-body)
26 do (plump:remove-child e)))
27 (plump:remove-child (elt (clss:select "h1" post-body) 0))
28 (plump:remove-child (elt (clss:select "p" post-body) 0))
29 (with-open-file (stream (merge-pathnames "./rts-content/" (subseq (quri:uri-path (quri:uri url)) 1)) :direction :output :if-does-not-exist :create :external-format :utf-8)
30 (plump:serialize post-body stream))))
32 (defun rts-to-html (file)
33 (declare (optimize (speed 0) (space 3)))
34 (concatenate 'string
35 "<style>"
36 (file-get-contents "./rts-content/rts.css")
37 "</style>"
38 (file-get-contents (merge-pathnames "./rts-content/" file))))
40 (defparameter *html-overrides* (make-hash-table :test 'equal))
41 (loop for (id file) in '(("XTXWPQSEgoMkAupKt" "An-Intuitive-Explanation-Of-Bayess-Theorem")
42 ("afmj8TKAqH6F2QMfZ" "A-Technical-Explanation-Of-Technical-Explanation")
43 ("7ZqGiPHTpiDMwqMN2" "The-Twelve-Virtues-Of-Rationality"))
44 do (let ((file* file)) (setf (gethash id *html-overrides*) (lambda () (rts-to-html file*)))))
46 (defmacro do-with-cleaners ((regexp-list scanner replacement) &body body)
47 (declare (optimize (speed 0) (space 3)))
48 `(labels ((fn (,scanner ,replacement) ,@body))
49 ,@(loop for (regex flags replacement) in (eval regexp-list)
50 collecting `(fn (load-time-value
51 (ppcre:create-scanner ,regex
52 ,@(loop for (flag sym) in '((#\i :case-insensitive-mode)
53 (#\m :multi-line-mode)
54 (#\s :single-line-mode)
55 (#\x :extended-mode))
56 when (find flag flags)
57 append (list sym t))))
58 ,replacement))))
60 (defmacro define-cleaner (name regexp-list)
61 (declare (optimize (speed 0) (space 3)))
62 `(defun ,name (text)
63 (do-with-cleaners (,regexp-list scanner replacement)
64 (setf text (ppcre:regex-replace-all scanner text replacement)))))
66 (eval-when (:compile-toplevel :load-toplevel :execute)
67 (defun read-regexp-file (filename)
68 (declare (optimize (speed 0) (space 3)))
69 (let ((data (destructuring-bind (* ((* (* inner))))
70 (with-open-file (stream (uiop:subpathname (asdf:system-source-directory "lw2-viewer") filename)) (parse-js:parse-js stream))
71 inner)))
72 (loop for input in data
73 collecting (destructuring-bind (* ((* regex flags) (* replacement))) input
74 (list regex flags (ppcre:regex-replace-all "\\$(\\d)" replacement "\\\\\\1")))))))
76 (define-cleaner clean-text (read-regexp-file "text-clean-regexps.js"))
77 (define-cleaner clean-html-regexps (read-regexp-file "html-clean-regexps.js"))
79 (declaim (ftype function url-scanner))
80 (eval-when (:compile-toplevel :load-toplevel :execute)
81 (setf (fdefinition 'url-scanner) (ppcre:create-scanner
82 "(?:https?://[-a-zA-Z0-9]+\\.[-a-zA-Z0-9.]+|[-a-zA-Z0-9.]+\\.(?:com|edu|gov|mil|net|org|int|biz|info|name|museum|us|ca|uk|io|ly))(?:\\:[0-9]+){0,1}(?:(?:/|\\?(?!(?:$|\\s)))(?:(?:(\\()|\\)(?![.,;:?!]?(?:$|\\s))|[-\\w\\d.,;:?'\\\\+@!&%$#=~–_/])*(?(1)[-\\w\\d\\\\+@&%$#=~_/)]|[-\\w\\d\\\\+@&%$#=~_/]))?)?"
83 :single-line-mode t)))
85 (defun hyphenate-string (string)
86 (let ((hyphenation-list (cl-typesetting::hyphenate-string string)))
87 (declare (type (and string (not base-string)) string)
88 (type list hyphenation-list))
89 (if hyphenation-list
90 (let ((new-string (make-array (+ (length string) (length hyphenation-list)) :element-type 'character :fill-pointer 0)))
91 (loop for char across string
92 for orig-offset of-type fixnum from 0
93 with current-hyphenation = hyphenation-list
94 do (when (and current-hyphenation (= orig-offset (the fixnum (first current-hyphenation))))
95 (vector-push #\SOFT_HYPHEN new-string)
96 (setf current-hyphenation (rest current-hyphenation)))
97 do (vector-push char new-string))
98 (values new-string hyphenation-list))
99 (values string nil))))
101 (defun clean-text-to-html (text &key (hyphenation t))
102 (handler-bind
103 (((or plump:invalid-xml-character plump:discouraged-xml-character) #'abort))
104 (clean-html-regexps
105 (plump:encode-entities
106 (coerce
107 (funcall (if hyphenation #'hyphenate-string #'identity) (clean-text text))
108 'simple-string)))))
110 (defun rot13-char (char)
111 (let ((char-code (char-code char)))
112 (labels ((translate-char (base)
113 (code-char (+ base (mod (+ 13 (- char-code base)) 26)))))
114 (declare (dynamic-extent #'translate-char))
115 (cond
116 ((<= (char-code #\A) char-code (char-code #\Z))
117 (translate-char (char-code #\A)))
118 ((<= (char-code #\a) char-code (char-code #\z))
119 (translate-char (char-code #\a)))
120 (t char)))))
122 (declaim (inline letter-index))
123 (defun letter-index (char)
124 (let ((char-code (char-code char)))
125 (cond
126 ((<= (char-code #\A) char-code (char-code #\Z))
127 (- char-code (char-code #\A)))
128 ((<= (char-code #\a) char-code (char-code #\z))
129 (- char-code (char-code #\a))))))
131 (defparameter *letter-frequencies*
132 '((#\A . 8.167) (#\B . 1.492) (#\C . 2.782) (#\D . 4.253) (#\E . 12.702)
133 (#\F . 2.228) (#\G . 2.015) (#\H . 6.094) (#\I . 6.966) (#\J . 0.153)
134 (#\K . 0.772) (#\L . 4.025) (#\M . 2.406) (#\N . 6.749) (#\O . 7.507)
135 (#\P . 1.929) (#\Q . 0.095) (#\R . 5.987) (#\S . 6.327) (#\T . 9.056)
136 (#\U . 2.758) (#\V . 0.978) (#\W . 2.36) (#\X . 0.15) (#\Y . 1.974)
137 (#\Z . 0.074)))
139 (defparameter *letter-rot13-log-odds*
140 (let ((array (make-array 26 :element-type 'single-float)))
141 (loop
142 for letter-freq in *letter-frequencies*
143 for rot13-freq in (concatenate 'list (subseq *letter-frequencies* 13 26) (subseq *letter-frequencies* 0 13))
144 for i from 0
145 do (setf (aref array i) (log (/ (cdr letter-freq) (cdr rot13-freq)) 2)))
146 array))
148 (defparameter *bigram-rot13-log-odds*
149 (let ((bigram-data (with-open-file (stream (asdf:system-relative-pathname "lw2-viewer" "data/bigrams.lisp"))
150 (read stream)))
151 (bigram-table (make-array '(26 26) :element-type 'single-float :initial-element 1.0))
152 (bigram-log-odds (make-array '(26 26) :element-type 'single-float)))
153 (loop
154 for (bigram count) in bigram-data
155 do (setf (aref bigram-table
156 (letter-index (aref bigram 0))
157 (letter-index (aref bigram 1)))
158 (float (+ count 1))))
159 (dotimes (i 26)
160 (dotimes (j 26)
161 (setf (aref bigram-log-odds i j)
162 (log (/ (aref bigram-table i j)
163 (aref bigram-table
164 (mod (+ 13 i) 26)
165 (mod (+ 13 j) 26)))
166 2))))
167 bigram-log-odds))
169 (defun rot13-text-p (text &optional (start 0) (end (length text)))
170 (declare (type simple-string text))
171 (let ((log-odds
172 (cond
173 ((= 0 (- end start))
174 0.0)
175 ((= 1 (- end start))
176 (let ((odds-table (load-time-value *letter-rot13-log-odds*)))
177 (declare (type (simple-array single-float (26)) odds-table))
178 (if-let ((letter-index (letter-index (aref text start))))
179 (aref odds-table letter-index)
180 0.0)))
182 (let ((odds-table (load-time-value *bigram-rot13-log-odds*)))
183 (declare (type (simple-array single-float (26 26)) odds-table))
184 (loop
185 with sum = 0.0
186 for i from (1+ start) to (1- end)
187 do (when-let ((a (letter-index (aref text (1- i))))
188 (b (letter-index (aref text i))))
189 (setf sum (+ sum (aref odds-table a b))))
190 finally (return sum)))))))
191 (values
192 (< log-odds (log 0.0001 2))
193 log-odds)))
195 (defun rot13-inplace (text &optional (start 0) (end (1- (length text))))
196 (declare (type simple-string text))
197 (loop
198 for i from start to end
199 do (setf (aref text i) (rot13-char (aref text i))))
200 text)
202 (defun unrot13-by-words (text)
203 (declare (type simple-string text))
204 (loop
205 with word-start = 0
206 for i from 0
207 for char across text
208 when (or (= i (1- (length text)))
209 (position char ",.:;?! "))
210 do (progn
211 (when (< (nth-value 1 (rot13-text-p text word-start i)) (+ 0 (- i word-start 0)))
212 (rot13-inplace text word-start i))
213 (setf word-start i)))
214 text)
216 (declaim (ftype (function (plump:node &rest simple-string) boolean) tag-is class-is-not text-class-is-not))
218 (defun tag-is (node &rest args)
219 (declare (type plump:node node)
220 (dynamic-extent args))
221 (when (plump:element-p node)
222 (let ((tag (plump:tag-name node)))
223 (to-boolean
224 (some (lambda (x) (string= tag x))
225 args)))))
227 (defun every-ancestor (node test)
228 (declare (type plump:node node)
229 (type function test))
230 (iter (for target first node then (plump:parent target))
231 (cond ((or (plump:root-p target) (null target)) (return t))
232 ((not (funcall test target)) (return nil)))))
234 (defun class-is-not (node &rest args)
235 (declare (type plump:node node)
236 (dynamic-extent args))
237 (every-ancestor node (lambda (n)
238 (not (intersection
239 (split-sequence #\Space (or (plump:attribute n "class") ""))
240 args
241 :test #'string=)))))
243 (defun text-class-is-not (node &rest args)
244 (declare (type plump:node node)
245 (dynamic-extent args))
246 (apply #'class-is-not (plump:parent node) args))
248 (defun clean-dom-text (root)
249 (handler-bind
250 (((or plump:invalid-xml-character plump:discouraged-xml-character) #'abort))
251 (labels
252 ((recursep (node)
253 (and (plump:element-p node)
254 (ppcre:scan "^(?:p|div|blockquote|li|h[0-6])$" (plump:tag-name node))))
255 (cleanablep (node)
256 (and (plump:text-node-p node)
257 (plump:parent node)
258 (text-class-is-not node "mjx-math" "arbital-math")
259 (not (tag-is (plump:parent node) "code"))))
260 (traverse (node main-fn &optional recurse-fn)
261 (when (cleanablep node) (funcall main-fn node))
262 (when (plump:nesting-node-p node)
263 (loop for n across (plump:children node)
264 do (if (recursep n)
265 (if recurse-fn (funcall recurse-fn n))
266 (traverse n main-fn recurse-fn))))))
267 (let* ((offset-list nil)
268 (whole-string-input
269 (with-output-to-string (stream)
270 (traverse
271 root
272 (lambda (node)
273 (push (length (the string (plump:text node))) offset-list)
274 (write-string (plump:text node) stream))
275 #'clean-dom-text)))
276 (whole-string-output whole-string-input))
277 (declare (type string whole-string-output whole-string-input))
278 (setf offset-list (nreverse offset-list))
279 (labels
280 ((call-with-offset-loop (continue-fn loop-fn next-offset-fn offset-adjust-fn)
281 (loop with current-offset = offset-list
282 with output-offset of-type (or null fixnum) = (first current-offset)
283 with output-offset-list = nil
284 with total-offset of-type fixnum = 0
285 while (funcall continue-fn)
286 do (funcall loop-fn)
287 do (loop for current-offset-num of-type fixnum = (first current-offset)
288 while (and (rest current-offset) (< (+ total-offset current-offset-num) (funcall next-offset-fn)))
289 do (progn
290 (push output-offset output-offset-list)
291 (setf total-offset (+ total-offset current-offset-num)
292 current-offset (cdr current-offset)
293 output-offset (first current-offset))))
294 do (setf output-offset (funcall offset-adjust-fn output-offset))
295 finally (progn
296 (push output-offset output-offset-list)
297 (loop for x in (rest current-offset) do (push x output-offset-list))
298 (setf offset-list (nreverse output-offset-list))))
299 (values)))
300 (declare (dynamic-extent (function call-with-offset-loop))
301 (ftype (function ((function ()) (function ()) (function () fixnum) (function (fixnum) fixnum)) (values)) call-with-offset-loop))
302 (macrolet
303 ((offset-loop ((list-binding list-form) (&body loop-body) (&body next-offset-body) (&body offset-adjust-body))
304 (with-gensyms (list-current)
305 `(let ((,list-current ,list-form)
306 (,list-binding))
307 (labels ((continue-fn () (if ,list-current (setf ,list-binding (pop ,list-current))))
308 (loop-fn () ,.loop-body)
309 (next-offset-fn () ,.next-offset-body)
310 (offset-adjust-fn ,(first offset-adjust-body) (declare (type fixnum ,(caar offset-adjust-body)) (values fixnum)) ,.(rest offset-adjust-body)))
311 (declare (dynamic-extent (function continue-fn) (function loop-fn) (function next-offset-fn) (function offset-adjust-fn)))
312 (call-with-offset-loop #'continue-fn #'loop-fn #'next-offset-fn #'offset-adjust-fn))))))
313 (do-with-cleaners ((read-regexp-file "text-clean-regexps.js") scanner replacement)
314 (let ((replacements 0)
315 (replacement-list nil)
316 (original-length (length whole-string-output)))
317 (declare (type fixnum replacements))
318 (ppcre:do-scans (match-start match-end reg-starts reg-ends scanner whole-string-output)
319 (declare (type fixnum match-start match-end)
320 (type simple-vector reg-starts reg-ends))
321 (incf replacements)
322 (push
323 (list (if (and (> (length reg-starts) 0) (eq (aref reg-starts 0) match-start))
324 (aref reg-ends 0)
325 match-start)
326 (if (and (> (length reg-starts) 0) (eq (aref reg-ends (- (length reg-ends) 1)) match-end))
327 (aref reg-starts (- (length reg-starts) 1))
328 match-end))
329 replacement-list))
330 (setf replacement-list (nreverse replacement-list))
331 (setf whole-string-output (ppcre:regex-replace-all scanner whole-string-output replacement))
332 (let ((length-difference (- (length whole-string-output) original-length))
333 (length-change 0))
334 (declare (type fixnum length-difference length-change))
335 (offset-loop
336 (current-replacement replacement-list)
337 ((setf length-change (ceiling length-difference replacements)
338 length-difference (- length-difference length-change)
339 replacements (- replacements 1)))
340 ((destructuring-bind (start end) current-replacement
341 (declare (type fixnum start end))
342 (ceiling (+ start end) 2)))
343 ((output-offset) (max 0 (+ output-offset length-change)))))))
344 (multiple-value-bind (hyphenated-string hyphenation-list) (hyphenate-string whole-string-output)
345 (setf whole-string-output hyphenated-string)
346 (offset-loop
347 (current-hyphenation hyphenation-list)
349 (current-hyphenation)
350 ((output-offset) (1+ output-offset))))))
351 (let ((current-offset 0))
352 (declare (type (or null fixnum) current-offset))
353 (traverse
354 root
355 (lambda (node)
356 (let ((output-length (length whole-string-output))
357 (next-offset (if offset-list (+ current-offset (the fixnum (first offset-list))) nil)))
358 (declare (type (or null fixnum) next-offset))
359 (setf (plump:text node) (subseq whole-string-output (min current-offset output-length) (and next-offset (min next-offset output-length)))
360 current-offset next-offset
361 offset-list (cdr offset-list))))
362 (lambda (node) (declare (ignore node))))))))
363 root)
365 ;;;; Dynamic content blocks.
367 (define-cache-database 'lw2.backend-modules:backend-lmdb-cache "dynamic-content-blocks")
369 (defparameter *dynamic-content-block-callback* nil)
371 (defclass dynamic-element (plump:element)
372 ((dynamic-call-form :initarg :dynamic-call-form :accessor dynamic-call-form)))
374 (defmethod plump:serialize-object :around ((node dynamic-element))
375 (funcall *dynamic-content-block-callback* :start node)
376 (call-next-method)
377 (funcall *dynamic-content-block-callback* :end node))
379 (defun create-dynamic-call (node function-name &rest args)
380 (change-class node 'dynamic-element :dynamic-call-form (list* function-name args)))
382 (defun call-with-dynamic-block-serialization (fn hash output-string)
383 (let* ((current-start nil)
384 (last-string-pos 0)
385 (last-octet-pos 0))
386 (flet ((current-octet-pos ()
387 (psetf last-string-pos (length output-string)
388 last-octet-pos (+ last-octet-pos (babel:string-size-in-octets output-string :start last-string-pos)))
389 last-octet-pos))
390 (declare (dynamic-extent #'current-octet-pos))
391 (let* ((dynamic-call-list nil)
392 (*dynamic-content-block-callback*
393 (lambda (phase node)
394 (case phase
395 (:start (setf current-start (current-octet-pos)))
396 (:end (let ((current-end (current-octet-pos)))
397 (push (list* current-start current-end (dynamic-call-form node))
398 dynamic-call-list)))))))
399 (declare (dynamic-extent *dynamic-content-block-callback*))
400 (multiple-value-prog1
401 (funcall fn)
402 (when dynamic-call-list
403 (cache-put "dynamic-content-blocks" hash (nreverse dynamic-call-list) :key-type :byte-vector :value-type :lisp)))))))
405 (defmacro with-dynamic-block-serialization ((hash output-string) &body body)
406 `(dynamic-flet ((fn () ,@body)) (call-with-dynamic-block-serialization #'fn ,hash ,output-string)))
408 ;;;;
410 (define-lmdb-memoized extract-excerpt 'lw2.backend-modules:backend-lmdb-cache
411 (:sources ("src/clean-html.lisp")) (in-html)
412 (let ((root (plump:parse (string-trim '(#\Space #\Newline #\Tab #\Return #\Linefeed #\Page) in-html)))
413 (chars 0)
414 (need-space nil))
415 (with-output-to-string (out-stream)
416 (block nil
417 (plump:traverse
418 root
419 (lambda (node)
420 (when (or (> (length (plump:children node)) 1)
421 (plump:text-node-p (plump:first-child node)))
422 (let ((text (plump:text node)))
423 (when need-space
424 (write-char #\Space out-stream))
425 (write-string text out-stream)
426 (setf chars (+ chars (length text))
427 need-space t)
428 (when (> chars 480)
429 (return nil)))))
430 :test (lambda (node) (tag-is node "p")))))))
432 (define-lmdb-memoized clean-html 'lw2.backend-modules:backend-lmdb-cache
433 (:sources ("src/clean-html.lisp" "src/links.lisp" "src/colors.lisp" "text-clean-regexps.js" "html-clean-regexps.js")) (in-html &key with-toc post-id)
434 (declare (ftype (function (plump:node) fixnum) plump:child-position)
435 (ftype (function (plump:node) (and vector (not simple-array))) plump:family)
436 (ftype (function (plump:node) simple-string) plump:text plump:tag-name))
437 (labels ((only-child-is (node &rest args)
438 (declare (dynamic-extent args))
439 (and (= 1 (length (plump:children node)))
440 (let ((child (plump:first-child node)))
441 (and
442 (plump:element-p child)
443 (apply #'tag-is child args)))))
444 (is-child-of-tag (node &rest args)
445 (declare (dynamic-extent args))
446 (loop for e = (plump:parent node) then (plump:parent e)
447 while (not (typep e 'plump:root))
448 when (and (plump:element-p e) (apply #'tag-is (cons e args))) return t))
449 (add-class (node class)
450 (declare (type plump:node node)
451 (type string class))
452 (let ((classes (adjoin class (alexandria:if-let (attr (plump:attribute node "class")) (split-sequence #\Space attr)) :test #'string=)))
453 (declare (dynamic-extent classes))
454 (setf (plump:attribute node "class") (format nil "~{~A~^ ~}" classes)))
455 node)
456 (remove-attributes (node &rest attrs)
457 (declare (dynamic-extent attrs))
458 (dolist (attr attrs)
459 (plump:remove-attribute node attr)))
460 (make-element-before (node tag)
461 (if (plump:text-node-p node)
462 (make-element-before (plump:parent node) tag)
463 (let ((e (plump:make-element (plump:parent node) tag)))
464 (plump:remove-child e)
465 (plump:insert-before node e)
466 (setf (plump:parent e) (plump:parent node))
467 e)))
468 (wrap-element (node element-name)
469 (let ((container (make-element-before node element-name)))
470 (plump:remove-child node)
471 (plump:append-child container node)
472 container))
473 (wrap-children (node element-name)
474 (let ((new-element (plump:make-element node element-name)))
475 (plump:remove-child new-element)
476 (setf (plump:children new-element) (plump:clone-children node t new-element)
477 (plump:children node) (plump:make-child-array))
478 (plump:append-child node new-element)))
479 (move-children-out-of-node (node &key keep)
480 (iterate (for c in-vector (plump:children node) downto 0)
481 (setf (plump:parent c) (plump:parent node))
482 (plump:insert-after node c))
483 (if keep
484 (setf (plump:children node) (plump:make-child-array))
485 (plump:remove-child node)))
486 (text-node-is-not (node &rest args)
487 (declare (type plump:node node)
488 (dynamic-extent args))
490 (typep (plump:parent node) 'plump:root)
491 (every (lambda (x) (string/= (plump:tag-name (plump:parent node)) x)) args)))
492 (adjacent-text-node (node direction)
493 (multiple-value-bind (get-sibling insert-sibling)
494 (ecase direction
495 (:previous (values #'plump:previous-sibling #'plump:insert-before))
496 (:next (values #'plump:next-sibling #'plump:insert-after)))
497 (let ((candidate (funcall get-sibling node)))
498 (if (plump:text-node-p candidate)
499 candidate
500 (let ((new-node (plump:make-text-node (plump:parent node))))
501 (funcall insert-sibling node new-node)
502 new-node)))))
503 (is-numeric (node)
504 (let ((text (plump:text node))
505 (alpha 0)
506 (digit 0))
507 (loop for c across text
508 when (alpha-char-p c)
509 do (incf alpha)
510 when (digit-char-p c)
511 do (incf digit)
512 finally (return (>= digit alpha)))))
513 (char-is-whitespace (c)
514 (or (cl-unicode:has-binary-property c "White_Space")
515 (eql c #\BRAILLE_PATTERN_BLANK)))
516 (string-is-whitespace (string)
517 (every #'char-is-whitespace string))
518 (remove-if-whitespace (node)
519 (when (string-is-whitespace (plump:text node))
520 (plump:remove-child node)))
521 (first-non-whitespace-child (node)
522 (loop for e across (plump:children node)
523 when (or (typep e 'plump:element) (not (string-is-whitespace (plump:text e)))) return e))
524 (find-text-node (node direction)
525 (let ((iterator #.`(case direction
526 ,@(loop for d in '(:first :last) collect
527 `(,d (lambda (fn)
528 (iterate (for c in-vector (plump:children node) ,@(if (eq d :last) '(downto 0)))
529 (funcall fn c))))))))
530 (declare (dynamic-extent iterator))
531 (cond
532 ((and (plump:text-node-p node) (plump:parent node))
533 node)
534 ((plump:nesting-node-p node)
535 (block nil
536 (funcall iterator (lambda (c)
537 (when-let (tn (find-text-node c direction)) (return tn)))))))))
538 (vacuum-whitespace (node)
539 (dolist (direction '(:first :last))
540 (let ((displaced-text (make-string 0)))
541 (loop for lt = (find-text-node node direction) do
542 (cond
543 ((not lt)
544 (return-from vacuum-whitespace node))
545 ((string-is-whitespace (plump:text lt))
546 (setf displaced-text (case direction
547 (:first (concatenate 'string displaced-text (plump:text lt)))
548 (:last (concatenate 'string (plump:text lt) displaced-text))))
549 (plump:remove-child lt))
551 (let* ((text (plump:text lt)))
552 (case direction
553 (:first
554 (let ((boundary (loop for i from 0 to (- (length text) 1)
555 unless (char-is-whitespace (aref text i))
556 return i)))
557 (setf displaced-text (concatenate 'string displaced-text (subseq text 0 boundary))
558 (plump:text lt) (subseq text boundary))))
559 (:last
560 (let ((boundary (loop for i from (- (length text) 1) downto 0
561 unless (char-is-whitespace (aref text i))
562 return i)))
563 (setf displaced-text (concatenate 'string (subseq text (+ 1 boundary) (length text)))
564 (plump:text lt) (subseq text 0 (+ 1 boundary)))))))
565 (return))))
566 (when (> (length displaced-text) 0)
567 (let ((atn (adjacent-text-node node (case direction (:first :previous) (:last :next)))))
568 (setf (plump:text atn) (case direction
569 (:first (concatenate 'string (plump:text atn) displaced-text))
570 (:last (concatenate 'string displaced-text (plump:text atn))))))))))
571 (add-element-style (node attribute value)
572 (let ((old-style (plump:attribute node "style")))
573 (setf (plump:attribute node "style")
574 (if old-style
575 (format nil "~A~:[;~;~] ~A: ~A;" old-style (ppcre:scan ";\s*$" old-style) attribute value)
576 (format nil "~A: ~A;" attribute value)))))
577 (style-string-to-alist (string)
578 (let ((rules (ppcre:split "\\s*;\\s*" string :sharedp t)))
579 (iter (for rule in rules)
580 (let ((parts (ppcre:split "\\s*:\\s*" rule :sharedp t)))
581 (when (= 2 (length parts))
582 (collect (cons (first parts) (second parts))))))))
583 (alist-to-style-string (alist)
584 (with-output-to-string (s)
585 (iter (for item in alist)
586 (format s "~A:~A;" (car item) (cdr item)))))
587 (remove-style-rules (node &rest rules)
588 (declare (dynamic-extent rules))
589 (when-let ((old-style (plump:attribute node "style")))
590 (setf (plump:attribute node "style")
591 (alist-to-style-string
592 (remove-if (lambda (x) (member (car x) rules :test #'string-equal))
593 (style-string-to-alist old-style))))))
594 (flatten-element (node)
595 (let* ((previous-sibling (plump:previous-sibling node))
596 (next-sibling (if (plump:text-node-p (plump:next-sibling node))
597 (plump:next-sibling node)))
598 (new-text-node (if (plump:text-node-p previous-sibling)
599 previous-sibling
600 (plump:insert-before node
601 (plump:remove-child (plump:make-text-node (plump:parent node)))))))
602 (setf (plump:parent new-text-node) (plump:parent node)
603 (plump:text new-text-node) (concatenate 'string
604 (plump:text new-text-node)
605 (plump:text node)
606 (if next-sibling
607 (plump:text next-sibling)
608 "")))
609 (plump:remove-child node)
610 (when next-sibling (plump:remove-child next-sibling))))
611 (scan-for-urls (text-node)
612 (declare (type plump:text-node text-node))
613 (let ((text (plump:text text-node)))
614 (multiple-value-bind (url-start url-end)
615 (ppcre:scan #'url-scanner text)
616 (declare (type simple-string text)
617 (type (or null fixnum) url-start url-end))
618 (when url-start
619 (let* ((url-raw (subseq text url-start url-end))
620 (url (if (mismatch "http" url-raw :end2 4) (concatenate 'string "http://" url-raw) url-raw))
621 (family (plump:family text-node))
622 (other-children (prog1
623 (subseq family (1+ (plump:child-position text-node)))
624 (setf (fill-pointer family) (1+ (plump:child-position text-node)))))
625 (new-a (plump:make-element (plump:parent text-node) "a"))
626 (new-text (unless (= url-end (length text)) (plump:make-text-node (plump:parent text-node) (subseq text url-end)))))
627 (setf (plump:text text-node) (subseq text 0 url-start)
628 (plump:attribute new-a "href") (with-direct-link (presentable-link url))
629 (plump:attribute new-a "class") "bare-url")
630 (plump:make-text-node new-a (clean-text url-raw))
631 (when new-text
632 (scan-for-urls new-text)
633 (setf (plump:text new-text) (clean-text (plump:text new-text))))
634 (loop for item across other-children
635 do (plump:append-child (plump:parent text-node) item))
636 (when (= url-start 0)
637 (plump:remove-child text-node)))))))
638 (title-to-anchor (text used-anchors)
639 ;; This should match LW behavior in packages/lesswrong/lib/collections/posts/tableOfContents.js
640 (let* ((chars-to-use "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789")
641 (base-anchor (with-output-to-string (stream)
642 (loop for c across text
643 do (write-char (if (find c chars-to-use) c #\_) stream)))))
644 (loop for suffix from 0
645 for anchor = base-anchor then (format nil "~A~A" base-anchor suffix)
646 when (not (gethash anchor used-anchors))
647 return (progn (setf (gethash anchor used-anchors) t)
648 anchor))))
649 (contents-to-html (contents min-header-level out-stream)
650 (declare (type cons contents))
651 (format out-stream "<nav class=\"contents\"><div class=\"contents-head\">Contents</div><ul class=\"contents-list\">")
652 (loop for (elem-level text id) in contents do
653 (format out-stream "<li class=\"toc-item-~A\"><a href=\"#~A\">~A</a></li>"
654 (- elem-level (- min-header-level 1)) id (clean-text-to-html text)))
655 (format out-stream "</ul></nav>"))
656 (style-hash-to-html (style-hash out-stream)
657 (declare (type hash-table style-hash))
658 (let ((style-list (alexandria:hash-table-keys style-hash)))
659 (if style-list
660 (format out-stream "<style>~{~A~}</style>" style-list)))))
661 (declare (ftype (function (plump:node &rest simple-string) boolean) only-child-is is-child-of-tag text-node-is-not))
662 (handler-bind
663 (((or plump:invalid-xml-character plump:discouraged-xml-character) #'abort))
664 (alexandria:if-let
665 (override (gethash post-id *html-overrides*))
666 (funcall override)
667 (let ((root (plump:parse (string-trim '(#\Space #\Newline #\Tab #\Return #\Linefeed #\Page) in-html)))
668 (contents nil)
669 (section-count 0)
670 (min-header-level 6)
671 (aggressive-deformat nil)
672 (style-hash (make-hash-table :test 'equal))
673 (used-colors (make-hash-table :test 'equal))
674 (used-anchors (make-hash-table :test 'equal)))
675 (declare (type fixnum section-count min-header-level))
676 (when *before-clean-hook*
677 (funcall *before-clean-hook*))
678 (let ((wayward-li-container nil))
679 (plump:traverse
680 root
681 (lambda (node)
682 (cond
683 ((not (plump:parent node)) nil)
684 ((tag-is node "a")
685 (cond
686 ((not (plump:attribute node "href"))
687 (move-children-out-of-node node :keep t))
688 ((and (ppcre:scan "^\s*https?://" (plump:text node))
689 (not (find #\HORIZONTAL_ELLIPSIS (plump:text node))))
690 (flatten-element node))
691 (t (tagbody start
692 (let* ((next-sibling (plump:next-sibling node))
693 (next-text-node (if (plump:text-node-p next-sibling) next-sibling))
694 (next-next-sibling (if next-text-node (plump:next-sibling next-text-node) next-sibling))
695 (next-a (if (and next-next-sibling (tag-is next-next-sibling "a")) next-next-sibling)))
696 (when (and next-a
697 (or (not next-text-node) (string-is-whitespace (plump:text next-text-node)))
698 (string= (plump:attribute node "href") (plump:attribute next-a "href")))
699 (when next-text-node
700 (plump:remove-child next-text-node)
701 (plump:append-child node next-text-node))
702 (loop for c across (plump:children next-a)
703 do (progn (plump:remove-child c)
704 (plump:append-child node c)))
705 (plump:remove-child next-a)
706 (go start)))))))
707 ((tag-is node "ul" "ol")
708 (setf wayward-li-container node)
709 (let ((new-children (plump:make-child-array)))
710 (loop for child across (plump:children node)
711 do (if (and (plump:element-p child) (tag-is child "li"))
712 (vector-push-extend child new-children)
713 (unless (and (plump:text-node-p child) (string-is-whitespace (plump:text child)))
714 (if (= (length new-children) 0)
715 (vector-push-extend (plump:make-element node "li") new-children))
716 (plump:append-child (aref new-children (- (length new-children) 1)) child))))
717 (setf (plump:children node) new-children)))
718 ((tag-is node "li")
719 (unless (is-child-of-tag node "ul" "ol")
720 (unless wayward-li-container
721 (setf wayward-li-container (make-element-before node "ul")))
722 (plump:remove-child node)
723 (plump:append-child wayward-li-container node)))
724 ((tag-is node "p" "blockquote" "div")
725 (setf wayward-li-container nil))))
726 :test #'plump:element-p))
727 (loop while (and (= 1 (length (plump:children root))) (plump:element-p (plump:first-child root)) (tag-is (plump:first-child root) "div" "html" "body"))
728 do (setf (plump:children root) (plump:children (plump:first-child root)))
729 do (loop for c across (plump:children root) do (setf (plump:parent c) root))
730 do (when-let (fc (plump:first-child root))
731 (when (and (plump:element-p fc) (tag-is fc "head"))
732 (loop for c across (plump:children fc) do
733 (when (and (plump:element-p c) (tag-is c "style"))
734 (setf (plump:parent c) (plump:parent fc))
735 (plump:insert-after fc c)))
736 (plump:remove-child fc))))
737 (loop for c across (plump:children root) do
738 (when (and (plump:element-p c)
739 (tag-is c "span")
740 (string-is-whitespace (plump:text c)))
741 (move-children-out-of-node c)))
742 (loop for lc = (plump:last-child root)
743 while (and (plump:element-p lc) (tag-is lc "br"))
744 do (plump:remove-child lc))
745 (plump:traverse
746 root
747 (lambda (node)
748 (when (and (plump:text-node-p node)
749 (plump:parent node)
750 (text-node-is-not node "a" "style" "pre"))
751 (scan-for-urls node))))
752 (plump:traverse
753 root
754 (lambda (node)
755 (when (and (not (plump:root-p node)) (plump:parent node))
756 (typecase node
757 (plump:text-node
758 (when (and (text-node-is-not node "style" "pre" "code")
759 (text-class-is-not node "mjx-math"))
760 (let ((new-root (plump:parse (clean-html-regexps (plump:serialize node nil))))
761 (other-children (prog1
762 (subseq (plump:family node) (1+ (plump:child-position node)))
763 (setf (fill-pointer (plump:family node)) (plump:child-position node)))))
764 (loop for item across (plump:children new-root)
765 do (plump:append-child (plump:parent node) item))
766 (loop for item across other-children
767 do (plump:append-child (plump:parent node) item)))))
768 (plump:element
769 (alexandria:when-let (style (plump:attribute node "style"))
770 (let ((style-list (style-string-to-alist style)))
771 (cond ((or aggressive-deformat
772 (cdr (assoc "font-family" style-list :test #'string-equal))
773 (search "font-style: inherit" style)
774 (search "MsoNormal" (plump:attribute node "class")))
775 (setf aggressive-deformat t)
776 (plump:remove-attribute node "style"))
777 ((ppcre:scan "(?:^|;)\\s*(?:line-height:[^;]+in)\\s*(?:;|$)" style)
778 (plump:remove-attribute node "style"))
780 (let (updated)
781 (iter (for style-item in style-list)
782 (when (member (car style-item) '("color" "background-color") :test #'string-equal)
783 (multiple-value-bind (r g b a) (decode-css-color (cdr style-item))
784 (when (and r g b a)
785 (let ((color-name (safe-color-name r g b a)))
786 (setf updated t
787 (gethash color-name used-colors) (list r g b a)
788 (cdr style-item) (format nil "var(--user-color-~A)" color-name)))))))
789 (when updated
790 (setf (plump:attribute node "style") (alist-to-style-string style-list))))))))
791 (when (and aggressive-deformat (tag-is node "div"))
792 (setf (plump:tag-name node) "p"))
793 (when (let ((class (plump:attribute node "class"))
794 (parent (plump:parent node)))
795 (and
796 (or (search "mjx-math" class)
797 (search "mjpage" class))
798 (and parent
799 (class-is-not parent "mjx-math" "mjpage"))))
800 (loop for current = node then (plump:parent current)
801 for parent = (plump:parent current)
802 when (loop for s across (plump:family current)
803 unless (or (eq s current)
804 (and (plump:text-node-p s) (string-is-whitespace (plump:text s))))
805 return t)
806 do (progn (add-class current "mathjax-inline-container")
807 (return))
808 when (or (plump:root-p parent)
809 (tag-is parent "p" "blockquote" "div"))
810 do (progn (add-class current "mathjax-block-container")
811 (return))))
812 (cond
813 ((tag-is node "a")
814 (vacuum-whitespace node)
815 (let ((href (plump:attribute node "href")))
816 (when href
817 (let* ((href (string-trim '(#\Space #\Newline #\Tab #\Return #\Linefeed #\Page) href))
818 (href (if (ppcre:scan "^(?:(?:[a-z]+:)?//|/|#)" href) href (format nil "http://~A" href)))
819 (href (or (with-direct-link (presentable-link href)) href)))
820 (when href
821 (setf (plump:attribute node "href") href)
822 (when *link-hook*
823 (log-and-ignore-errors
824 (funcall *link-hook* href))))))))
825 ((tag-is node "img")
826 (let ((width (ignore-errors (parse-integer (plump:attribute node "width"))))
827 (height (ignore-errors (parse-integer (plump:attribute node "height")))))
828 (if (and width height (<= width 1) (<= height 1))
829 ;; Remove probable tracking pixel.
830 (plump:remove-child node)
831 (let ((container
832 (if (and (tag-is (plump:parent node) "div" "p" "figure")
833 (only-child-is (plump:parent node) "img"))
834 (plump:parent node) ; Should already have imgonly class.
835 (let ((container (wrap-element node "div")))
836 (add-class container "imgonly")
837 container))))
838 (when-let ((src (plump:attribute node "src")))
839 (setf src (presentable-link src :image)
840 (plump:attribute node "src") src)
841 (create-dynamic-call container 'lw2.images::dynamic-image
842 src (plump:tag-name container) (alexandria:hash-table-alist (plump:attributes container))))
843 (when (and width height)
844 ;; Apply responsive image scaling CSS.
845 (setf (plump:attribute container "style") (format nil "--aspect-ratio: ~F; max-width: ~Dpx"
846 (/ (float width)
847 (float height))
848 width)))
849 (remove-attributes node "style" "class" "width" "height")))))
850 ((tag-is node "figure")
851 (remove-attributes node "style" "class" "width" "height"))
852 ((and (tag-is node "p") (only-child-is node "figure"))
853 (move-children-out-of-node node))
854 ((tag-is node "p" "blockquote" "div" "center")
855 (when (only-child-is node "center")
856 (unless (string-is-whitespace (plump:text node))
857 (add-element-style node "text-align" "center"))
858 (move-children-out-of-node (plump:first-child node)))
859 (when (tag-is node "center")
860 (setf (plump:tag-name node) "p")
861 (add-element-style node "text-align" "center"))
862 (when-let ((question-id (plump:attribute node "data-elicit-id")))
863 (when-let* ((question-title (lw2.backend::get-elicit-question-title question-id))
864 (links (plump:get-elements-by-tag-name node "a"))
865 (text-node (plump:first-child (first links))))
866 (setf (plump:text text-node) question-title))
867 (create-dynamic-call (plump:parent (plump:parent node)) 'lw2.elicit-predictions::render-elicit-block question-id))
868 (if (string-is-whitespace (plump:text node))
869 (if (or (plump:get-elements-by-tag-name node "img")
870 (plump:get-elements-by-tag-name node "iframe"))
871 (add-class node "imgonly")
872 (plump:remove-child node))
873 (if-let (parent (plump:parent node))
874 (labels ((spoilerp (n)
875 (if-let (a (and (plump:element-p n) (plump:attribute n "class")))
876 (ppcre:scan "(?:^| )spoiler\\S*(?: |$)" a))))
877 (when (and nil (tag-is node "p") ;; FIXME: disabled until we can fix math and code false positives
878 (rot13-text-p (plump:text node)))
879 (setf (plump:attribute node "class") "spoiler")
880 (plump:traverse node
881 (lambda (n) (unrot13-by-words (plump:text n)))
882 :test #'plump:text-node-p))
883 (cond
884 ((and (tag-is node "p")
885 (spoilerp node)
886 (spoilerp parent))
887 (plump:remove-attribute node "class"))
888 ((and (tag-is node "div")
889 (spoilerp node))
890 (setf (plump:attribute node "class") "spoiler"))
891 ((and (spoilerp node)
892 (tag-is node "p")
893 (not (spoilerp parent)))
894 (let ((previous-sibling (plump:previous-sibling node)))
895 (if (and previous-sibling (spoilerp previous-sibling))
896 (progn (plump:remove-child node)
897 (plump:append-child previous-sibling node)
898 (plump:remove-attribute node "class"))
899 (let ((new-container (plump:make-element parent "div")))
900 (setf (plump:attribute new-container "class") "spoiler")
901 (plump:remove-child new-container)
902 (setf (plump:parent new-container) (plump:parent node))
903 (plump:insert-before node new-container)
904 (loop for e = node then ns
905 while (and (plump:element-p e) (spoilerp e))
906 for ns = (plump:next-sibling e)
907 do (progn
908 (plump:remove-attribute e "class")
909 (plump:remove-child e)
910 (plump:append-child new-container e))))))))))))
911 ((tag-is node "table" "tbody" "tr" "td")
912 (remove-style-rules node "border-top" "border-bottom" "border-left" "border-right" "padding")
913 (when (and (tag-is node "td") (is-numeric node))
914 (add-class node "numeric")))
915 ((tag-is node "u")
916 (let ((parent (plump:parent node)))
917 (cond
918 ((and (or (plump:root-p parent) (and (plump:element-p parent) (tag-is parent "p" "blockquote" "div")))
919 (loop for c across (plump:children node) never (and (plump:element-p c) (tag-is c "a"))))
920 (vacuum-whitespace node))
922 (move-children-out-of-node node)))))
923 ((tag-is node "ol")
924 (when-let (old-style (plump:attribute node "style"))
925 (setf (plump:attribute node "style")
926 (ppcre:regex-replace-all
927 (load-time-value (ppcre:create-scanner "list-style-type\\s*:\\s*decimal\\s*;?" :single-line-mode t :case-insensitive-mode t))
928 old-style
929 "")))
930 (when-let (start-string (plump:attribute node "start"))
931 (when-let (start (ignore-errors (parse-integer start-string)))
932 (plump:remove-attribute node "start")
933 (add-element-style node "counter-reset" (format nil "ol ~A" (- start 1))))))
934 ((tag-is node "li")
935 (when (let ((c (plump:first-child node))) (and c (if (plump:text-node-p c) (not (string-is-whitespace (plump:text c))) (not (tag-is c "p" "ul" "ol")))))
936 (wrap-children node "p")))
937 ((tag-is node "pre")
938 (let ((nchildren (length (plump:children node))))
939 (when (>= nchildren 1)
940 (remove-if-whitespace (plump:first-child node))
941 (when (>= nchildren 2)
942 (remove-if-whitespace (plump:last-child node))))))
943 ((ppcre:scan "^h[1-6]$" (plump:tag-name node))
944 (when (plump:get-elements-by-tag-name node "p")
945 (move-children-out-of-node node))
946 (cond
947 ((string-is-whitespace (plump:text node))
948 (plump:remove-child node))
950 (let ((fc (plump:first-child node))
951 (lc (plump:last-child node)))
952 (when (and (plump:element-p fc) (tag-is fc "br")) (plump:remove-child fc))
953 (when (and (not (eql fc lc)) (plump:element-p lc) (tag-is lc "br")) (plump:remove-child lc)))
954 (when with-toc
955 (incf section-count)
956 (unless (plump:attribute node "id") (setf (plump:attribute node "id") (format nil "section-~A" section-count)))
957 (let* ((header-level (parse-integer (subseq (plump:tag-name node) 1)))
958 (header-text (with-output-to-string (stream)
959 (plump:traverse node
960 (lambda (n)
961 (typecase n
962 (plump:text-node
963 (when (text-node-is-not n "style" "script")
964 (write-string (plump:text n) stream))))))))
965 (anchor-old (or (plump:attribute node "id") (format nil "section-~A" section-count)))
966 (anchor-new (title-to-anchor header-text used-anchors))
967 (wrapper (wrap-children node "span")))
968 (setf min-header-level (min min-header-level header-level)
969 (plump:attribute node "id") anchor-new
970 (plump:attribute wrapper "id") anchor-old)
971 (push (list header-level
972 header-text
973 anchor-new)
974 contents))))))
975 ((tag-is node "style")
976 (let ((text (plump:text node)))
977 (when (search ".mjx-math" text)
978 (setf (gethash text style-hash) t)))
979 (plump:remove-child node))
980 ((tag-is node "script")
981 (plump:remove-child node))))))))
982 (clean-dom-text root)
983 (let ((with-toc (>= section-count 3))
984 (out-string (make-array 0 :element-type 'character :fill-pointer 0 :adjustable t)))
985 (with-output-to-string (out-stream out-string)
986 (style-hash-to-html style-hash out-stream)
987 (when (> (hash-table-count used-colors) 0)
988 (format out-stream "<style>~%:root {~%")
989 (maphash (lambda (name rgba-list)
990 (declare (ignore rgba-list))
991 (format out-stream " --user-color-~A: #~A;~%" name name))
992 used-colors)
993 (flet ((write-inverted-colors (theme)
994 (format out-stream "body.theme-~A {~%" theme)
995 (maphash (lambda (name rgba-list)
996 (format out-stream " --user-color-~A: ~A;~%" name
997 (multiple-value-call #'encode-css-color (apply #'perceptual-invert-rgba rgba-list))))
998 used-colors)
999 (format out-stream "}~%")))
1000 (format out-stream "}~%@media (prefers-color-scheme: dark) {~%")
1001 (write-inverted-colors "default")
1002 (format out-stream "}~%")
1003 (write-inverted-colors "dark"))
1004 (format out-stream "</style>"))
1005 (with-dynamic-block-serialization (current-memo-hash out-string)
1006 (loop for c across (plump:children root)
1007 when (and with-toc
1008 (not (or (string-is-whitespace (plump:text c))
1009 (tag-is c "figure"))))
1010 do (progn
1011 (contents-to-html (nreverse contents) min-header-level out-stream)
1012 (setf with-toc nil))
1013 do (plump:serialize c out-stream)))
1014 out-string)))))))