Fix bug with spoiler tags not appearing.
[lw2-viewer.git] / src / clean-html.lisp
blob1ec529fcf1e49c4f6eebcdac50cce2a3af9b26c1
1 (uiop:define-package #:lw2.clean-html
2 (:use #:cl #:alexandria #:iterate #:split-sequence #:lw2.lmdb #:lw2.links #:lw2.utils #:lw2.context #:lw2.sites #:lw2.conditions #:lw2.colors)
3 (:export #:*before-clean-hook* #:*link-hook* #:url-scanner #:clean-text #:clean-text-to-html #:title-to-anchor #:contents-to-html #:clean-html #:clean-html* #:extract-excerpt #:extract-excerpt*)
4 (:unintern #:*text-clean-regexps* #:*html-clean-regexps*))
6 (in-package #:lw2.clean-html)
8 (setf cl-typesetting-hyphen::*hyphen-patterns-directory* (asdf:system-relative-pathname "lw2-viewer" "data/hyphenation-patterns/"))
9 (setf cl-typesetting-hyphen::*language-hyphen-file-list* '((:en-us . "hyph_en_US")))
10 (cl-typesetting-hyphen:load-language :en-us)
11 (setf cl-typesetting::*default-hyphen-language* :en-us)
13 (defvar *before-clean-hook* nil)
14 (defvar *link-hook* nil)
16 (defun file-get-contents (filename)
17 (with-open-file (stream filename)
18 (uiop:slurp-stream-string stream)))
20 (defun grab-from-rts (url)
21 (declare (optimize (speed 0) (space 3)))
22 (let* ((root (plump:parse (dex:get url :keep-alive nil)))
23 (post-body (plump:get-element-by-id root "wikitext")))
24 (loop for cls in '("div.nav_menu" "div.imgonly" "div.bottom_nav") do
25 (loop for e across (clss:select cls post-body)
26 do (plump:remove-child e)))
27 (plump:remove-child (elt (clss:select "h1" post-body) 0))
28 (plump:remove-child (elt (clss:select "p" post-body) 0))
29 (with-open-file (stream (merge-pathnames "./rts-content/" (subseq (quri:uri-path (quri:uri url)) 1)) :direction :output :if-does-not-exist :create :external-format :utf-8)
30 (plump:serialize post-body stream))))
32 (defun rts-to-html (file)
33 (declare (optimize (speed 0) (space 3)))
34 (concatenate 'string
35 "<style>"
36 (file-get-contents "./rts-content/rts.css")
37 "</style>"
38 (file-get-contents (merge-pathnames "./rts-content/" file))))
40 (defparameter *html-overrides* (make-hash-table :test 'equal))
41 (loop for (id file) in '(("XTXWPQSEgoMkAupKt" "An-Intuitive-Explanation-Of-Bayess-Theorem")
42 ("afmj8TKAqH6F2QMfZ" "A-Technical-Explanation-Of-Technical-Explanation")
43 ("7ZqGiPHTpiDMwqMN2" "The-Twelve-Virtues-Of-Rationality"))
44 do (let ((file* file)) (setf (gethash id *html-overrides*) (lambda () (rts-to-html file*)))))
46 (defmacro do-with-cleaners ((regexp-list scanner replacement) &body body)
47 (declare (optimize (speed 0) (space 3)))
48 `(labels ((fn (,scanner ,replacement) ,@body))
49 ,@(loop for (regex flags replacement) in (eval regexp-list)
50 collecting `(fn (load-time-value
51 (ppcre:create-scanner ,regex
52 ,@(loop for (flag sym) in '((#\i :case-insensitive-mode)
53 (#\m :multi-line-mode)
54 (#\s :single-line-mode)
55 (#\x :extended-mode))
56 when (find flag flags)
57 append (list sym t))))
58 ,replacement))))
60 (defmacro define-cleaner (name regexp-list)
61 (declare (optimize (speed 0) (space 3)))
62 `(defun ,name (text)
63 (do-with-cleaners (,regexp-list scanner replacement)
64 (setf text (ppcre:regex-replace-all scanner text replacement)))))
66 (eval-when (:compile-toplevel :load-toplevel :execute)
67 (defun read-regexp-file (filename)
68 (declare (optimize (speed 0) (space 3)))
69 (let ((data (destructuring-bind (* ((* (* inner))))
70 (with-open-file (stream (uiop:subpathname (asdf:system-source-directory "lw2-viewer") filename)) (parse-js:parse-js stream))
71 inner)))
72 (loop for input in data
73 collecting (destructuring-bind (* ((* regex flags) (* replacement))) input
74 (list regex flags (ppcre:regex-replace-all "\\$(\\d)" replacement "\\\\\\1")))))))
76 (define-cleaner clean-text (read-regexp-file "text-clean-regexps.js"))
77 (define-cleaner clean-html-regexps (read-regexp-file "html-clean-regexps.js"))
79 (declaim (ftype function url-scanner))
80 (eval-when (:compile-toplevel :load-toplevel :execute)
81 (setf (fdefinition 'url-scanner) (ppcre:create-scanner
82 "(?:https?://[-a-zA-Z0-9]+\\.[-a-zA-Z0-9.]+|[-a-zA-Z0-9.]+\\.(?:com|edu|gov|mil|net|org|int|biz|info|name|museum|us|ca|uk|io|ly))(?:\\:[0-9]+){0,1}(?:(?:/|\\?(?!(?:$|\\s)))(?:(?:(\\()|\\)(?![.,;:?!]?(?:$|\\s))|[-\\w\\d.,;:?'\\\\+@!&%$#=~–_/])*(?(1)[-\\w\\d\\\\+@&%$#=~_/)]|[-\\w\\d\\\\+@&%$#=~_/]))?)?"
83 :single-line-mode t)))
85 (defun hyphenate-string (string)
86 (let ((hyphenation-list (cl-typesetting::hyphenate-string string)))
87 (declare (type string string)
88 (type list hyphenation-list))
89 (if hyphenation-list
90 (let ((new-string (make-array (+ (length string) (length hyphenation-list)) :element-type 'character :fill-pointer 0)))
91 (loop for char across string
92 for orig-offset of-type fixnum from 0
93 with current-hyphenation = hyphenation-list
94 do (when (and current-hyphenation (= orig-offset (the fixnum (first current-hyphenation))))
95 (vector-push #\SOFT_HYPHEN new-string)
96 (setf current-hyphenation (rest current-hyphenation)))
97 do (vector-push char new-string))
98 (values (coerce new-string '(simple-array character 1)) hyphenation-list))
99 (values string nil))))
101 (defun clean-text-to-html (text &key (hyphenation t))
102 (handler-bind
103 (((or plump:invalid-xml-character plump:discouraged-xml-character) #'abort))
104 (clean-html-regexps
105 (plump:encode-entities
106 (coerce
107 (funcall (if hyphenation #'hyphenate-string #'identity) (clean-text text))
108 'simple-string)))))
110 (defun rot13-char (char)
111 (let ((char-code (char-code char)))
112 (labels ((translate-char (base)
113 (code-char (+ base (mod (+ 13 (- char-code base)) 26)))))
114 (declare (dynamic-extent #'translate-char))
115 (cond
116 ((<= (char-code #\A) char-code (char-code #\Z))
117 (translate-char (char-code #\A)))
118 ((<= (char-code #\a) char-code (char-code #\z))
119 (translate-char (char-code #\a)))
120 (t char)))))
122 (declaim (inline letter-index))
123 (defun letter-index (char)
124 (let ((char-code (char-code char)))
125 (cond
126 ((<= (char-code #\A) char-code (char-code #\Z))
127 (- char-code (char-code #\A)))
128 ((<= (char-code #\a) char-code (char-code #\z))
129 (- char-code (char-code #\a))))))
131 (defparameter *letter-frequencies*
132 '((#\A . 8.167) (#\B . 1.492) (#\C . 2.782) (#\D . 4.253) (#\E . 12.702)
133 (#\F . 2.228) (#\G . 2.015) (#\H . 6.094) (#\I . 6.966) (#\J . 0.153)
134 (#\K . 0.772) (#\L . 4.025) (#\M . 2.406) (#\N . 6.749) (#\O . 7.507)
135 (#\P . 1.929) (#\Q . 0.095) (#\R . 5.987) (#\S . 6.327) (#\T . 9.056)
136 (#\U . 2.758) (#\V . 0.978) (#\W . 2.36) (#\X . 0.15) (#\Y . 1.974)
137 (#\Z . 0.074)))
139 (defparameter *letter-rot13-log-odds*
140 (let ((array (make-array 26 :element-type 'single-float)))
141 (loop
142 for letter-freq in *letter-frequencies*
143 for rot13-freq in (concatenate 'list (subseq *letter-frequencies* 13 26) (subseq *letter-frequencies* 0 13))
144 for i from 0
145 do (setf (aref array i) (log (/ (cdr letter-freq) (cdr rot13-freq)) 2)))
146 array))
148 (defparameter *bigram-rot13-log-odds*
149 (let ((bigram-data (with-open-file (stream (asdf:system-relative-pathname "lw2-viewer" "data/bigrams.lisp"))
150 (read stream)))
151 (bigram-table (make-array '(26 26) :element-type 'single-float :initial-element 1.0))
152 (bigram-log-odds (make-array '(26 26) :element-type 'single-float)))
153 (loop
154 for (bigram count) in bigram-data
155 do (setf (aref bigram-table
156 (letter-index (aref bigram 0))
157 (letter-index (aref bigram 1)))
158 (float (+ count 1))))
159 (dotimes (i 26)
160 (dotimes (j 26)
161 (setf (aref bigram-log-odds i j)
162 (log (/ (aref bigram-table i j)
163 (aref bigram-table
164 (mod (+ 13 i) 26)
165 (mod (+ 13 j) 26)))
166 2))))
167 bigram-log-odds))
169 (defun rot13-text-p (text &optional (start 0) (end (length text)))
170 (declare (type simple-string text))
171 (let ((log-odds
172 (cond
173 ((= 0 (- end start))
174 0.0)
175 ((= 1 (- end start))
176 (let ((odds-table (load-time-value *letter-rot13-log-odds*)))
177 (declare (type (simple-array single-float (26)) odds-table))
178 (if-let ((letter-index (letter-index (aref text start))))
179 (aref odds-table letter-index)
180 0.0)))
182 (let ((odds-table (load-time-value *bigram-rot13-log-odds*)))
183 (declare (type (simple-array single-float (26 26)) odds-table))
184 (loop
185 with sum = 0.0
186 for i from (1+ start) to (1- end)
187 do (when-let ((a (letter-index (aref text (1- i))))
188 (b (letter-index (aref text i))))
189 (setf sum (+ sum (aref odds-table a b))))
190 finally (return sum)))))))
191 (values
192 (< log-odds (log 0.0001 2))
193 log-odds)))
195 (defun rot13-inplace (text &optional (start 0) (end (1- (length text))))
196 (declare (type simple-string text))
197 (loop
198 for i from start to end
199 do (setf (aref text i) (rot13-char (aref text i))))
200 text)
202 (defun unrot13-by-words (text)
203 (declare (type simple-string text))
204 (loop
205 with word-start = 0
206 for i from 0
207 for char across text
208 when (or (= i (1- (length text)))
209 (position char ",.:;?! "))
210 do (progn
211 (when (< (nth-value 1 (rot13-text-p text word-start i)) (+ 0 (- i word-start 0)))
212 (rot13-inplace text word-start i))
213 (setf word-start i)))
214 text)
216 (defun char-is-whitespace (c)
217 (or (cl-unicode:has-binary-property c "White_Space")
218 (eql c #\BRAILLE_PATTERN_BLANK)))
220 (defun string-is-whitespace (string)
221 (every #'char-is-whitespace string))
223 (defvar *mathjax-process* nil)
225 (defmacro with-mathjax-processor (() &body body)
226 `(let ((*mathjax-process* nil))
227 (unwind-protect
228 (progn ,@body)
229 (when *mathjax-process*
230 (uiop:close-streams *mathjax-process*)
231 (uiop:wait-process *mathjax-process*)))))
233 (defun start-mathjax-process ()
234 (unless *mathjax-process*
235 (setf *mathjax-process* (uiop:launch-program "node js-foreign-lib/mathjax.js"
236 :input :stream :output :stream
237 :element-type '(unsigned-byte 8))))
238 (values (uiop:process-info-input *mathjax-process*)
239 (uiop:process-info-output *mathjax-process*)))
241 (defun process-mathjax (tex)
242 (multiple-value-bind (outstream instream) (start-mathjax-process)
243 (let ((outbuf (babel:string-to-octets tex)))
244 (nibbles:write-ub32/le (length outbuf) outstream)
245 (write-sequence outbuf outstream)
246 (finish-output outstream))
247 (let ((insize (nibbles:read-ub32/le instream)))
248 (when (> insize 0)
249 (let ((inbuf (make-array insize :element-type '(unsigned-byte 8))))
250 (read-sequence inbuf instream)
251 (babel:octets-to-string inbuf))))))
253 (defun handle-codecogs (node uri)
254 (regex-case uri
255 ("^https?://[a-z.]*codecogs\.com/[a-z.]*\\?(.*)$"
256 (declare (regex-groups-min 1))
257 (let ((math-html (process-mathjax (quri:url-decode (reg 0)))))
258 (when math-html
259 (let* ((math-dom (plump:parse math-html))
260 (children (plump:children math-dom)))
261 (if (string-is-whitespace (plump:text (plump:parent node)))
262 (loop for child across children
263 do (plump:insert-after node child)
264 do (setf (plump:parent child) (plump:parent node)))
265 (let ((container (plump:make-element (plump:parent node) "span")))
266 (plump:remove-child container)
267 (plump:insert-after node container)
268 (setf (plump:parent container) (plump:parent node)
269 (plump:children container) children)
270 (loop for child across children
271 do (setf (plump:parent child) container))))
272 (plump:remove-child node))
273 t)))
274 (t nil)))
276 (declaim (ftype (function (plump:node) list) class-list))
278 (defun class-list (node)
279 (let ((class (plump:attribute node "class")))
280 (and class (split-sequence #\Space class))))
282 (declaim (ftype (function (plump:node &rest simple-string) boolean) tag-is class-is-not text-class-is-not))
284 (defun tag-is (node &rest args)
285 (declare (type plump:node node)
286 (dynamic-extent args))
287 (when (plump:element-p node)
288 (let ((tag (plump:tag-name node)))
289 (to-boolean
290 (some (lambda (x) (string= tag x))
291 args)))))
293 (defun every-ancestor (node test)
294 (declare (type plump:node node)
295 (type function test))
296 (iter (for target first node then (plump:parent target))
297 (cond ((or (null target) (plump:root-p target)) (return t))
298 ((not (funcall test target)) (return nil)))))
300 (defun any-ancestor (node test)
301 (not (every-ancestor node (complement test))))
303 (defun class-is-not (node &rest args)
304 (declare (type plump:node node)
305 (dynamic-extent args))
306 (every-ancestor node (lambda (n)
307 (not (intersection
308 (class-list n)
309 args
310 :test #'string=)))))
312 (defun class-is (node &rest args)
313 (declare (type plump:node node)
314 (dynamic-extent args))
315 (any-ancestor node (lambda (n)
316 (intersection
317 (class-list n)
318 args
319 :test #'string=))))
321 (defun text-class-is-not (node &rest args)
322 (declare (type plump:node node)
323 (dynamic-extent args))
324 (apply #'class-is-not (plump:parent node) args))
326 (defun clean-dom-text (root)
327 (handler-bind
328 (((or plump:invalid-xml-character plump:discouraged-xml-character) #'abort))
329 (labels
330 ((recursep (node)
331 (and (plump:element-p node)
332 (ppcre:scan "^(?:p|div|blockquote|li|h[0-6])$" (plump:tag-name node))))
333 (cleanablep (node)
334 (and (plump:text-node-p node)
335 (plump:parent node)
336 (text-class-is-not node "mjx-math" "arbital-math")
337 (not (tag-is (plump:parent node) "code"))))
338 (traverse (node main-fn &optional recurse-fn)
339 (when (cleanablep node) (funcall main-fn node))
340 (when (plump:nesting-node-p node)
341 (loop for n across (plump:children node)
342 do (if (recursep n)
343 (if recurse-fn (funcall recurse-fn n))
344 (traverse n main-fn recurse-fn))))))
345 (let* ((offset-list nil)
346 (whole-string-input
347 (with-output-to-string (stream)
348 (traverse
349 root
350 (lambda (node)
351 (push (length (the string (plump:text node))) offset-list)
352 (write-string (plump:text node) stream))
353 #'clean-dom-text)))
354 (whole-string-output whole-string-input))
355 (declare (type string whole-string-output whole-string-input))
356 (setf offset-list (nreverse offset-list))
357 (labels
358 ((call-with-offset-loop (continue-fn loop-fn next-offset-fn offset-adjust-fn)
359 (loop with current-offset = offset-list
360 with output-offset of-type (or null fixnum) = (first current-offset)
361 with output-offset-list = nil
362 with total-offset of-type fixnum = 0
363 while (funcall continue-fn)
364 do (funcall loop-fn)
365 do (loop for current-offset-num of-type fixnum = (first current-offset)
366 while (and (rest current-offset) (< (+ total-offset current-offset-num) (funcall next-offset-fn)))
367 do (progn
368 (push output-offset output-offset-list)
369 (setf total-offset (+ total-offset current-offset-num)
370 current-offset (cdr current-offset)
371 output-offset (first current-offset))))
372 do (setf output-offset (funcall offset-adjust-fn output-offset))
373 finally (progn
374 (push output-offset output-offset-list)
375 (loop for x in (rest current-offset) do (push x output-offset-list))
376 (setf offset-list (nreverse output-offset-list))))
377 (values)))
378 (declare (dynamic-extent (function call-with-offset-loop))
379 (ftype (function ((function ()) (function ()) (function () fixnum) (function (fixnum) fixnum)) (values)) call-with-offset-loop))
380 (macrolet
381 ((offset-loop ((list-binding list-form) (&body loop-body) (&body next-offset-body) (&body offset-adjust-body))
382 (with-gensyms (list-current)
383 `(let ((,list-current ,list-form)
384 (,list-binding))
385 (labels ((continue-fn () (if ,list-current (setf ,list-binding (pop ,list-current))))
386 (loop-fn () ,.loop-body)
387 (next-offset-fn () ,.next-offset-body)
388 (offset-adjust-fn ,(first offset-adjust-body) (declare (type fixnum ,(caar offset-adjust-body)) (values fixnum)) ,.(rest offset-adjust-body)))
389 (declare (dynamic-extent (function continue-fn) (function loop-fn) (function next-offset-fn) (function offset-adjust-fn)))
390 (call-with-offset-loop #'continue-fn #'loop-fn #'next-offset-fn #'offset-adjust-fn))))))
391 (do-with-cleaners ((read-regexp-file "text-clean-regexps.js") scanner replacement)
392 (let ((replacements 0)
393 (replacement-list nil)
394 (original-length (length whole-string-output)))
395 (declare (type fixnum replacements))
396 (ppcre:do-scans (match-start match-end reg-starts reg-ends scanner whole-string-output)
397 (declare (type fixnum match-start match-end)
398 (type simple-vector reg-starts reg-ends))
399 (incf replacements)
400 (push
401 (list (if (and (> (length reg-starts) 0) (eq (aref reg-starts 0) match-start))
402 (aref reg-ends 0)
403 match-start)
404 (if (and (> (length reg-starts) 0) (eq (aref reg-ends (- (length reg-ends) 1)) match-end))
405 (aref reg-starts (- (length reg-starts) 1))
406 match-end))
407 replacement-list))
408 (setf replacement-list (nreverse replacement-list))
409 (setf whole-string-output (ppcre:regex-replace-all scanner whole-string-output replacement))
410 (let ((length-difference (- (length whole-string-output) original-length))
411 (length-change 0))
412 (declare (type fixnum length-difference length-change))
413 (offset-loop
414 (current-replacement replacement-list)
415 ((setf length-change (ceiling length-difference replacements)
416 length-difference (- length-difference length-change)
417 replacements (- replacements 1)))
418 ((destructuring-bind (start end) current-replacement
419 (declare (type fixnum start end))
420 (ceiling (+ start end) 2)))
421 ((output-offset) (max 0 (+ output-offset length-change)))))))
422 (multiple-value-bind (hyphenated-string hyphenation-list) (hyphenate-string whole-string-output)
423 (setf whole-string-output hyphenated-string)
424 (offset-loop
425 (current-hyphenation hyphenation-list)
427 (current-hyphenation)
428 ((output-offset) (1+ output-offset))))))
429 (let ((current-offset 0))
430 (declare (type (or null fixnum) current-offset))
431 (traverse
432 root
433 (lambda (node)
434 (let ((output-length (length whole-string-output))
435 (next-offset (if offset-list (+ current-offset (the fixnum (first offset-list))) nil)))
436 (declare (type (or null fixnum) next-offset))
437 (setf (plump:text node) (subseq whole-string-output (min current-offset output-length) (and next-offset (min next-offset output-length)))
438 current-offset next-offset
439 offset-list (cdr offset-list))))
440 (lambda (node) (declare (ignore node))))))))
441 root)
443 ;;;; Dynamic content blocks.
445 (define-cache-database 'lw2.backend-modules:backend-lmdb-cache "dynamic-content-blocks")
447 (defparameter *dynamic-content-block-callback* nil)
449 (defclass dynamic-element (plump:element)
450 ((dynamic-call-form :initarg :dynamic-call-form :accessor dynamic-call-form)))
452 (defmethod plump:serialize-object :around ((node dynamic-element))
453 (funcall *dynamic-content-block-callback* :start node)
454 (call-next-method)
455 (funcall *dynamic-content-block-callback* :end node))
457 (defun create-dynamic-call (node function-name &rest args)
458 (change-class node 'dynamic-element :dynamic-call-form (list* function-name args)))
460 (defun call-with-dynamic-block-serialization (fn hash output-string)
461 (let* ((current-start nil)
462 (last-string-pos 0)
463 (last-octet-pos 0))
464 (flet ((current-octet-pos ()
465 (psetf last-string-pos (length output-string)
466 last-octet-pos (+ last-octet-pos (babel:string-size-in-octets output-string :start last-string-pos)))
467 last-octet-pos))
468 (declare (dynamic-extent #'current-octet-pos))
469 (let* ((dynamic-call-list nil)
470 (*dynamic-content-block-callback*
471 (lambda (phase node)
472 (case phase
473 (:start (setf current-start (current-octet-pos)))
474 (:end (let ((current-end (current-octet-pos)))
475 (push (list* current-start current-end (dynamic-call-form node))
476 dynamic-call-list)))))))
477 (declare (dynamic-extent *dynamic-content-block-callback*))
478 (multiple-value-prog1
479 (funcall fn)
480 (if dynamic-call-list
481 (cache-put "dynamic-content-blocks" hash (nreverse dynamic-call-list) :key-type :byte-vector :value-type :lisp)
482 (cache-del "dynamic-content-blocks" hash :key-type :byte-vector)))))))
484 (defmacro with-dynamic-block-serialization ((hash output-string) &body body)
485 `(dynamic-flet ((fn () ,@body)) (call-with-dynamic-block-serialization #'fn ,hash ,output-string)))
487 ;;;;
489 (define-lmdb-memoized extract-excerpt 'lw2.backend-modules:backend-lmdb-cache
490 (:sources ("src/clean-html.lisp")) (in-html)
491 (let ((root (plump:parse (string-trim '(#\Space #\Newline #\Tab #\Return #\Linefeed #\Page) in-html)))
492 (chars 0)
493 (need-space nil))
494 (with-output-to-string (out-stream)
495 (block nil
496 (plump:traverse
497 root
498 (lambda (node)
499 (when (or (> (length (plump:children node)) 1)
500 (plump:text-node-p (plump:first-child node)))
501 (let ((text (plump:text node)))
502 (when need-space
503 (write-char #\Space out-stream))
504 (write-string text out-stream)
505 (setf chars (+ chars (length text))
506 need-space t)
507 (when (> chars 480)
508 (return nil)))))
509 :test (lambda (node) (tag-is node "p")))))))
511 (defun title-to-anchor (text used-anchors)
512 ;; This should match LW behavior in packages/lesswrong/lib/collections/posts/tableOfContents.js
513 (let* ((chars-to-use "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789")
514 (base-anchor (with-output-to-string (stream)
515 (loop for c across text
516 do (write-char (if (find c chars-to-use) c #\_) stream)))))
517 (loop for suffix from 0
518 for anchor = base-anchor then (format nil "~A~A" base-anchor suffix)
519 when (not (gethash anchor used-anchors))
520 return (progn (setf (gethash anchor used-anchors) t)
521 anchor))))
523 (defun contents-to-html (contents min-header-level out-stream)
524 (declare (type cons contents))
525 (format out-stream "<nav class=\"contents\"><script>injectTOCCollapseToggleButton()</script><div class=\"contents-head\">Contents</div><ul class=\"contents-list\">")
526 (loop for (elem-level text id) in contents do
527 (let* #.(loop for regex in '("^[0-9]+\\. "
528 "^[0-9]+: "
529 "(?i)^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\. "
530 "^[A-Z]\. ")
531 collect `(text (ppcre:regex-replace ,regex text "")))
532 (format out-stream "<li class=\"toc-item-~A\"><a href=\"#~A\">~A</a></li>"
533 (- elem-level (- min-header-level 1)) id (clean-text-to-html text))))
534 (format out-stream "</ul></nav>"))
536 (define-lmdb-memoized clean-html 'lw2.backend-modules:backend-lmdb-cache
537 (:sources ("src/clean-html.lisp" "src/links.lisp" "src/colors.lisp" "text-clean-regexps.js" "html-clean-regexps.js" "js-foreign-lib/mathjax.js")) (in-html &key with-toc post-id)
538 (declare (ftype (function (plump:node) fixnum) plump:child-position)
539 (ftype (function (plump:node) (and vector (not simple-array))) plump:family)
540 (ftype (function (plump:node) simple-string) plump:text plump:tag-name))
541 (labels ((only-child-is (node &rest args)
542 (declare (dynamic-extent args))
543 (and (= 1 (length (plump:children node)))
544 (let ((child (plump:first-child node)))
545 (and
546 (plump:element-p child)
547 (apply #'tag-is child args)))))
548 (is-child-of-tag (node &rest args)
549 (declare (dynamic-extent args))
550 (loop for e = (plump:parent node) then (plump:parent e)
551 while (not (typep e 'plump:root))
552 when (and (plump:element-p e) (apply #'tag-is (cons e args))) return t))
553 (add-class (node class)
554 (declare (type plump:node node)
555 (type string class))
556 (let ((classes (adjoin class (alexandria:if-let (attr (plump:attribute node "class")) (split-sequence #\Space attr)) :test #'string=)))
557 (declare (dynamic-extent classes))
558 (setf (plump:attribute node "class") (format nil "~{~A~^ ~}" classes)))
559 node)
560 (remove-attributes (node &rest attrs)
561 (declare (dynamic-extent attrs))
562 (dolist (attr attrs)
563 (plump:remove-attribute node attr)))
564 (make-element-before (node tag)
565 (if (plump:text-node-p node)
566 (make-element-before (plump:parent node) tag)
567 (let ((e (plump:make-element (plump:parent node) tag)))
568 (plump:remove-child e)
569 (plump:insert-before node e)
570 (setf (plump:parent e) (plump:parent node))
571 e)))
572 (wrap-element (node element-name)
573 (let ((container (make-element-before node element-name)))
574 (plump:remove-child node)
575 (plump:append-child container node)
576 container))
577 (wrap-children (node element-name)
578 (let ((new-element (plump:make-element node element-name)))
579 (plump:remove-child new-element)
580 (setf (plump:children new-element) (plump:clone-children node t new-element)
581 (plump:children node) (plump:make-child-array))
582 (plump:append-child node new-element)))
583 (move-children-out-of-node (node &key keep)
584 (iterate (for c in-vector (plump:children node) downto 0)
585 (setf (plump:parent c) (plump:parent node))
586 (plump:insert-after node c))
587 (if keep
588 (setf (plump:children node) (plump:make-child-array))
589 (plump:remove-child node)))
590 (text-node-is-not (node &rest args)
591 (declare (type plump:node node)
592 (dynamic-extent args))
594 (typep (plump:parent node) 'plump:root)
595 (every (lambda (x) (string/= (plump:tag-name (plump:parent node)) x)) args)))
596 (adjacent-text-node (node direction)
597 (multiple-value-bind (get-sibling insert-sibling)
598 (ecase direction
599 (:previous (values #'plump:previous-sibling #'plump:insert-before))
600 (:next (values #'plump:next-sibling #'plump:insert-after)))
601 (let ((candidate (funcall get-sibling node)))
602 (if (plump:text-node-p candidate)
603 candidate
604 (let ((new-node (plump:make-text-node (plump:parent node))))
605 (funcall insert-sibling node new-node)
606 new-node)))))
607 (is-numeric (node)
608 (let ((text (plump:text node))
609 (alpha 0)
610 (digit 0))
611 (loop for c across text
612 when (alpha-char-p c)
613 do (incf alpha)
614 when (digit-char-p c)
615 do (incf digit)
616 finally (return (>= digit alpha)))))
617 (remove-if-whitespace (node)
618 (when (string-is-whitespace (plump:text node))
619 (plump:remove-child node)))
620 (first-non-whitespace-child (node)
621 (loop for e across (plump:children node)
622 when (or (typep e 'plump:element) (not (string-is-whitespace (plump:text e)))) return e))
623 (find-text-node (node direction)
624 (let ((iterator #.`(case direction
625 ,@(loop for d in '(:first :last) collect
626 `(,d (lambda (fn)
627 (iterate (for c in-vector (plump:children node) ,@(if (eq d :last) '(downto 0)))
628 (funcall fn c))))))))
629 (declare (dynamic-extent iterator))
630 (cond
631 ((and (plump:text-node-p node) (plump:parent node))
632 node)
633 ((plump:nesting-node-p node)
634 (block nil
635 (funcall iterator (lambda (c)
636 (when-let (tn (find-text-node c direction)) (return tn)))))))))
637 (vacuum-whitespace (node)
638 (dolist (direction '(:first :last))
639 (let ((displaced-text (make-string 0)))
640 (loop for lt = (find-text-node node direction) do
641 (cond
642 ((not lt)
643 (return-from vacuum-whitespace node))
644 ((string-is-whitespace (plump:text lt))
645 (setf displaced-text (case direction
646 (:first (concatenate 'string displaced-text (plump:text lt)))
647 (:last (concatenate 'string (plump:text lt) displaced-text))))
648 (plump:remove-child lt))
650 (let* ((text (plump:text lt)))
651 (case direction
652 (:first
653 (let ((boundary (loop for i from 0 to (- (length text) 1)
654 unless (char-is-whitespace (aref text i))
655 return i)))
656 (setf displaced-text (concatenate 'string displaced-text (subseq text 0 boundary))
657 (plump:text lt) (subseq text boundary))))
658 (:last
659 (let ((boundary (loop for i from (- (length text) 1) downto 0
660 unless (char-is-whitespace (aref text i))
661 return i)))
662 (setf displaced-text (concatenate 'string (subseq text (+ 1 boundary) (length text)))
663 (plump:text lt) (subseq text 0 (+ 1 boundary)))))))
664 (return))))
665 (when (> (length displaced-text) 0)
666 (let ((atn (adjacent-text-node node (case direction (:first :previous) (:last :next)))))
667 (setf (plump:text atn) (case direction
668 (:first (concatenate 'string (plump:text atn) displaced-text))
669 (:last (concatenate 'string displaced-text (plump:text atn))))))))))
670 (add-element-style (node attribute value)
671 (let ((old-style (plump:attribute node "style")))
672 (setf (plump:attribute node "style")
673 (if old-style
674 (format nil "~A~:[;~;~] ~A: ~A;" old-style (ppcre:scan ";\s*$" old-style) attribute value)
675 (format nil "~A: ~A;" attribute value)))))
676 (style-string-to-alist (string)
677 (let ((rules (ppcre:split "\\s*;\\s*" string :sharedp t)))
678 (iter (for rule in rules)
679 (let ((parts (ppcre:split "\\s*:\\s*" rule :sharedp t)))
680 (when (= 2 (length parts))
681 (collect (cons (first parts) (second parts))))))))
682 (alist-to-style-string (alist)
683 (with-output-to-string (s)
684 (iter (for item in alist)
685 (format s "~A:~A;" (car item) (cdr item)))))
686 (remove-style-rules (node &rest rules)
687 (declare (dynamic-extent rules))
688 (when-let ((old-style (plump:attribute node "style")))
689 (setf (plump:attribute node "style")
690 (alist-to-style-string
691 (remove-if (lambda (x) (member (car x) rules :test #'string-equal))
692 (style-string-to-alist old-style))))))
693 (flatten-element (node)
694 (let* ((previous-sibling (plump:previous-sibling node))
695 (next-sibling (if (plump:text-node-p (plump:next-sibling node))
696 (plump:next-sibling node)))
697 (new-text-node (if (plump:text-node-p previous-sibling)
698 previous-sibling
699 (plump:insert-before node
700 (plump:remove-child (plump:make-text-node (plump:parent node)))))))
701 (setf (plump:parent new-text-node) (plump:parent node)
702 (plump:text new-text-node) (concatenate 'string
703 (plump:text new-text-node)
704 (plump:text node)
705 (if next-sibling
706 (plump:text next-sibling)
707 "")))
708 (plump:remove-child node)
709 (when next-sibling (plump:remove-child next-sibling))))
710 (scan-for-urls (text-node)
711 (declare (type plump:text-node text-node))
712 (let ((text (plump:text text-node)))
713 (multiple-value-bind (url-start url-end)
714 (ppcre:scan #'url-scanner text)
715 (declare (type simple-string text)
716 (type (or null fixnum) url-start url-end))
717 (when url-start
718 (let* ((url-raw (subseq text url-start url-end))
719 (url (if (mismatch "http" url-raw :end2 4) (concatenate 'string "http://" url-raw) url-raw))
720 (family (plump:family text-node))
721 (other-children (prog1
722 (subseq family (1+ (plump:child-position text-node)))
723 (setf (fill-pointer family) (1+ (plump:child-position text-node)))))
724 (new-a (plump:make-element (plump:parent text-node) "a"))
725 (new-text (unless (= url-end (length text)) (plump:make-text-node (plump:parent text-node) (subseq text url-end)))))
726 (setf (plump:text text-node) (subseq text 0 url-start)
727 (plump:attribute new-a "href") (with-direct-link (presentable-link url))
728 (plump:attribute new-a "class") "bare-url")
729 (plump:make-text-node new-a (clean-text url-raw))
730 (when new-text
731 (scan-for-urls new-text)
732 (setf (plump:text new-text) (clean-text (plump:text new-text))))
733 (loop for item across other-children
734 do (plump:append-child (plump:parent text-node) item))
735 (when (= url-start 0)
736 (plump:remove-child text-node)))))))
737 (style-hash-to-html (style-hash out-stream)
738 (declare (type hash-table style-hash))
739 (let ((style-list (alexandria:hash-table-keys style-hash)))
740 (if style-list
741 (format out-stream "<style>~{~A~}</style>" style-list))))
742 (spoilerp (n)
743 (if-let (a (and (plump:element-p n) (plump:attribute n "class")))
744 (ppcre:scan "(?:^| )spoiler\\S*(?: |$)" a))))
745 (declare (ftype (function (plump:node &rest simple-string) boolean) only-child-is is-child-of-tag text-node-is-not))
746 (handler-bind
747 (((or plump:invalid-xml-character plump:discouraged-xml-character) #'abort))
748 (alexandria:if-let
749 (override (gethash post-id *html-overrides*))
750 (funcall override)
751 (let ((root (plump:parse (string-trim '(#\Space #\Newline #\Tab #\Return #\Linefeed #\Page) in-html)))
752 (contents nil)
753 (section-count 0)
754 (min-header-level 6)
755 (aggressive-deformat nil)
756 (style-hash (make-hash-table :test 'equal))
757 (used-colors (make-hash-table :test 'equal))
758 (used-anchors (make-hash-table :test 'equal)))
759 (declare (type fixnum section-count min-header-level))
760 (when *before-clean-hook*
761 (funcall *before-clean-hook*))
762 (let ((wayward-li-container nil))
763 (with-mathjax-processor ()
764 (plump:traverse
765 root
766 (lambda (node)
767 (cond
768 ((tag-is node "img")
769 (when-let ((src (plump:attribute node "src")))
770 (handle-codecogs node src)))
771 ((not (plump:parent node)) nil)
772 ((tag-is node "a")
773 (cond
774 ((not (plump:attribute node "href"))
775 (move-children-out-of-node node :keep t))
776 ((or
777 (and (ppcre:scan "^\s*https?://" (plump:text node))
778 (not (find #\HORIZONTAL_ELLIPSIS (plump:text node))))
779 (notany (lambda (attr) (nonempty-string (plump:attribute node attr))) '("href" "name" "id")))
780 (flatten-element node))
781 (t (tagbody start
782 (let* ((next-sibling (plump:next-sibling node))
783 (next-text-node (if (plump:text-node-p next-sibling) next-sibling))
784 (next-next-sibling (if next-text-node (plump:next-sibling next-text-node) next-sibling))
785 (next-a (if (and next-next-sibling (tag-is next-next-sibling "a")) next-next-sibling)))
786 (when (and next-a
787 (or (not next-text-node) (string-is-whitespace (plump:text next-text-node)))
788 (string= (plump:attribute node "href") (plump:attribute next-a "href")))
789 (when next-text-node
790 (plump:remove-child next-text-node)
791 (plump:append-child node next-text-node))
792 (loop for c across (plump:children next-a)
793 do (progn (plump:remove-child c)
794 (plump:append-child node c)))
795 (plump:remove-child next-a)
796 (go start)))))))
797 ((tag-is node "ul" "ol")
798 (setf wayward-li-container node)
799 (let ((new-children (plump:make-child-array)))
800 (loop for child across (plump:children node)
801 do (if (and (plump:element-p child) (tag-is child "li"))
802 (vector-push-extend child new-children)
803 (unless (and (plump:text-node-p child) (string-is-whitespace (plump:text child)))
804 (if (= (length new-children) 0)
805 (vector-push-extend (plump:make-element node "li") new-children))
806 (plump:append-child (aref new-children (- (length new-children) 1)) child))))
807 (setf (plump:children node) new-children)))
808 ((tag-is node "li")
809 (unless (is-child-of-tag node "ul" "ol")
810 (unless wayward-li-container
811 (setf wayward-li-container (make-element-before node "ul")))
812 (plump:remove-child node)
813 (plump:append-child wayward-li-container node)))
814 ((tag-is node "p" "blockquote" "div")
815 (setf wayward-li-container nil))))
816 :test #'plump:element-p)))
817 (loop while (and (= 1 (length (plump:children root)))
818 (plump:element-p (plump:first-child root))
819 (tag-is (plump:first-child root) "div" "html" "body")
820 (not (spoilerp (plump:first-child root))))
821 do (setf (plump:children root) (plump:children (plump:first-child root)))
822 do (loop for c across (plump:children root) do (setf (plump:parent c) root))
823 do (when-let (fc (plump:first-child root))
824 (when (and (plump:element-p fc) (tag-is fc "head"))
825 (loop for c across (plump:children fc) do
826 (when (and (plump:element-p c) (tag-is c "style"))
827 (setf (plump:parent c) (plump:parent fc))
828 (plump:insert-after fc c)))
829 (plump:remove-child fc))))
830 (loop for c across (plump:children root) do
831 (when (and (plump:element-p c)
832 (tag-is c "span")
833 (string-is-whitespace (plump:text c)))
834 (move-children-out-of-node c)))
835 (loop for lc = (plump:last-child root)
836 while (and (plump:element-p lc) (tag-is lc "br"))
837 do (plump:remove-child lc))
838 (plump:traverse
839 root
840 (lambda (node)
841 (when (and (plump:text-node-p node)
842 (plump:parent node)
843 (text-node-is-not node "a" "style" "pre"))
844 (scan-for-urls node))))
845 (plump:traverse
846 root
847 (lambda (node)
848 (when (and (not (plump:root-p node)) (plump:parent node))
849 (typecase node
850 (plump:text-node
851 (when (and (text-node-is-not node "style" "pre" "code")
852 (text-class-is-not node "mjx-math"))
853 (let ((new-root (plump:parse (clean-html-regexps (plump:serialize node nil))))
854 (other-children (prog1
855 (subseq (plump:family node) (1+ (plump:child-position node)))
856 (setf (fill-pointer (plump:family node)) (plump:child-position node)))))
857 (loop for item across (plump:children new-root)
858 do (plump:append-child (plump:parent node) item))
859 (loop for item across other-children
860 do (plump:append-child (plump:parent node) item)))))
861 (plump:element
862 (alexandria:when-let (style (plump:attribute node "style"))
863 (let ((style-list (style-string-to-alist style)))
864 (cond ((or aggressive-deformat
865 (cdr (assoc "font-family" style-list :test #'string-equal))
866 (search "font-style: inherit" style)
867 (search "MsoNormal" (plump:attribute node "class")))
868 (setf aggressive-deformat t)
869 (plump:remove-attribute node "style"))
870 ((ppcre:scan "(?:^|;)\\s*(?:line-height:[^;]+in)\\s*(?:;|$)" style)
871 (plump:remove-attribute node "style"))
873 (let (updated)
874 (dolist (style-item style-list)
875 (when (member (car style-item) '("color" "background" "background-color" "border" "border-color") :test #'string-equal)
876 (setf (cdr style-item)
877 (regex-replace-body ("#[0-9a-fA-F]{3,8}|rgba?\\((?:.*?)\\)|hsla?\\((?:.*?)\\)" (cdr style-item))
878 (multiple-value-bind (r g b a) (decode-css-color (match))
879 (when (and r g b a)
880 (let ((color-name (safe-color-name r g b a)))
881 (setf updated t
882 (gethash color-name used-colors) (list r g b a))
883 (format nil "var(--user-color-~A)" color-name))))))))
884 (when updated
885 (setf (plump:attribute node "style") (alist-to-style-string style-list))))))))
886 (when (and aggressive-deformat (tag-is node "div"))
887 (setf (plump:tag-name node) "p"))
888 (when (let ((parent (plump:parent node)))
889 (and
890 (intersection (class-list node) '("mjx-chtml" "mjx-math" "mjpage") :test #'string=)
891 (and parent
892 (class-is-not parent "mjx-chtml" "mjx-math" "mjpage"))))
893 (cond
894 ((let ((mrows (clss:select ".mjx-mrow" node)))
895 (and (not (zerop (length mrows)))
896 (every (lambda (mrow)
897 (zerop (length (plump:children mrow))))
898 mrows)))
899 (plump:remove-child node))
900 (:otherwise
901 (loop
902 with full-width = (or (class-is node "mjx-full-width")
903 (loop for e across (plump:children node)
904 when (member "MJXc-display" (class-list e) :test #'string=)
905 return t))
906 for current = node then (plump:parent current)
907 for parent = (plump:parent current)
908 when (and parent
909 (loop for s across (plump:family current)
910 unless (or (eq s current)
911 (and (plump:text-node-p s) (string-is-whitespace (plump:text s))))
912 return t))
913 do (progn (add-class current (if full-width
914 "mathjax-block-container"
915 "mathjax-inline-container"))
916 (return))
917 when (or (null parent)
918 (plump:root-p parent)
919 (tag-is parent "p" "blockquote" "div"))
920 do (progn (add-class current "mathjax-block-container")
921 (return))))))
922 (cond
923 ((tag-is node "a")
924 (vacuum-whitespace node)
925 (let ((href (plump:attribute node "href")))
926 (when href
927 (let* ((href (string-trim '(#\Space #\Newline #\Tab #\Return #\Linefeed #\Page) href))
928 (href (if (ppcre:scan "^(?:(?:[a-z]+:)?//|/|#)" href) href (format nil "http://~A" href)))
929 (href (or (with-direct-link (presentable-link href)) href)))
930 (when href
931 (setf (plump:attribute node "href") href)
932 (when *link-hook*
933 (log-and-ignore-errors
934 (funcall *link-hook* href))))))))
935 ((tag-is node "img")
936 (let ((width (ignore-errors (parse-integer (plump:attribute node "width"))))
937 (height (ignore-errors (parse-integer (plump:attribute node "height")))))
938 (if (and width height (<= width 1) (<= height 1))
939 ;; Remove probable tracking pixel.
940 (plump:remove-child node)
941 (let ((container
942 (if (and (tag-is (plump:parent node) "div" "p" "figure")
943 (only-child-is (plump:parent node) "img"))
944 (plump:parent node) ; Should already have imgonly class.
945 (let ((container (wrap-element node "div")))
946 (add-class container "imgonly")
947 container))))
948 (when-let ((src (plump:attribute node "src")))
949 (setf src (presentable-link src :image)
950 (plump:attribute node "src") src)
951 (create-dynamic-call container 'lw2.images::dynamic-image
953 (plump:tag-name container)
954 (alexandria:hash-table-alist (plump:attributes container))
955 (alexandria:hash-table-alist (plump:attributes node))))
956 (when (and width height)
957 ;; Apply responsive image scaling CSS.
958 (setf (plump:attribute container "style") (format nil "--aspect-ratio: ~F; max-width: ~Dpx"
959 (/ (float width)
960 (float height))
961 width)))
962 (remove-attributes node "style" "class" "width" "height")
963 (setf (plump:attribute node "loading") "lazy")))))
964 ((tag-is node "figure")
965 (remove-attributes node "style" "class" "width" "height"))
966 ((and (tag-is node "p") (only-child-is node "figure"))
967 (move-children-out-of-node node))
968 ((tag-is node "p" "blockquote" "div" "center")
969 (when (only-child-is node "center")
970 (unless (string-is-whitespace (plump:text node))
971 (add-element-style node "text-align" "center"))
972 (move-children-out-of-node (plump:first-child node)))
973 (when (tag-is node "center")
974 (setf (plump:tag-name node) "p")
975 (add-element-style node "text-align" "center"))
976 (when-let ((question-id (plump:attribute node "data-elicit-id")))
977 (when-let* ((question-title (lw2.backend::get-elicit-question-title question-id))
978 (links (plump:get-elements-by-tag-name node "a"))
979 (text-node (plump:first-child (first links))))
980 (setf (plump:text text-node) question-title))
981 (create-dynamic-call (plump:parent (plump:parent node)) 'lw2.elicit-predictions::render-elicit-block question-id))
982 (if (string-is-whitespace (plump:text node))
983 (if (or (plump:get-elements-by-tag-name node "img")
984 (plump:get-elements-by-tag-name node "iframe"))
985 (add-class node "imgonly")
986 (plump:remove-child node))
987 (if-let (parent (plump:parent node))
988 (progn
989 (when (and nil (tag-is node "p") ;; FIXME: disabled until we can fix math and code false positives
990 (rot13-text-p (plump:text node)))
991 (setf (plump:attribute node "class") "spoiler")
992 (plump:traverse node
993 (lambda (n) (unrot13-by-words (plump:text n)))
994 :test #'plump:text-node-p))
995 (cond
996 ((and (tag-is node "p")
997 (spoilerp node)
998 (spoilerp parent))
999 (plump:remove-attribute node "class"))
1000 ((and (tag-is node "div")
1001 (spoilerp node))
1002 (setf (plump:attribute node "class") "spoiler"))
1003 ((and (spoilerp node)
1004 (tag-is node "p")
1005 (not (spoilerp parent)))
1006 (let ((previous-sibling (plump:previous-sibling node)))
1007 (if (and previous-sibling (spoilerp previous-sibling))
1008 (progn (plump:remove-child node)
1009 (plump:append-child previous-sibling node)
1010 (plump:remove-attribute node "class"))
1011 (let ((new-container (plump:make-element parent "div")))
1012 (setf (plump:attribute new-container "class") "spoiler")
1013 (plump:remove-child new-container)
1014 (setf (plump:parent new-container) (plump:parent node))
1015 (plump:insert-before node new-container)
1016 (loop for e = node then ns
1017 while (and (plump:element-p e) (spoilerp e))
1018 for ns = (plump:next-sibling e)
1019 do (progn
1020 (plump:remove-attribute e "class")
1021 (plump:remove-child e)
1022 (plump:append-child new-container e))))))))))))
1023 ((tag-is node "table" "tbody" "tr" "td")
1024 (remove-style-rules node "border-top" "border-bottom" "border-left" "border-right" "padding")
1025 (when (and (tag-is node "td") (is-numeric node))
1026 (add-class node "numeric")))
1027 ((tag-is node "u")
1028 (let ((parent (plump:parent node)))
1029 (cond
1030 ((and (or (plump:root-p parent) (and (plump:element-p parent) (tag-is parent "p" "blockquote" "div")))
1031 (loop for c across (plump:children node) never (and (plump:element-p c) (tag-is c "a"))))
1032 (vacuum-whitespace node))
1034 (move-children-out-of-node node)))))
1035 ((tag-is node "ol")
1036 (when-let (old-style (plump:attribute node "style"))
1037 (setf (plump:attribute node "style")
1038 (ppcre:regex-replace-all
1039 (load-time-value (ppcre:create-scanner "list-style-type\\s*:\\s*decimal\\s*;?" :single-line-mode t :case-insensitive-mode t))
1040 old-style
1041 "")))
1042 (when-let (start-string (plump:attribute node "start"))
1043 (when-let (start (ignore-errors (parse-integer start-string)))
1044 (plump:remove-attribute node "start")
1045 (add-element-style node "counter-reset" (format nil "ol ~A" (- start 1))))))
1046 ((and (tag-is node "li") (find "footnote-item" (class-list node) :test #'string-equal))
1047 (unless (find-if (lambda (e) (and (plump:element-p e) (find "footnote-content" (class-list e) :test #'string-equal)))
1048 (plump:children node))
1049 ;; footnote-content div does not exist. We must create it and move the footnote contents into place.
1050 (let* ((wrapper (wrap-children node "div"))
1051 (backrefs (clss:select ".footnote-backref" wrapper))
1052 (back-links (make-element-before wrapper "span")))
1053 (setf (plump:attribute wrapper "class") "footnote-content"
1054 (plump:attribute back-links "class") "footnote-back-link")
1055 (loop for backref across backrefs do
1056 (progn (plump:remove-child backref)
1057 (plump:append-child back-links backref))))))
1058 ((tag-is node "li")
1059 (when (let ((c (plump:first-child node))) (and c (if (plump:text-node-p c) (not (string-is-whitespace (plump:text c))) (not (tag-is c "p" "ul" "ol")))))
1060 (wrap-children node "p")))
1061 ((tag-is node "pre")
1062 (let ((nchildren (length (plump:children node))))
1063 (when (>= nchildren 1)
1064 (remove-if-whitespace (plump:first-child node))
1065 (when (>= nchildren 2)
1066 (remove-if-whitespace (plump:last-child node))))))
1067 ((ppcre:scan "^h[1-6]$" (plump:tag-name node))
1068 (when (plump:get-elements-by-tag-name node "p")
1069 (move-children-out-of-node node))
1070 (cond
1071 ((string-is-whitespace (plump:text node))
1072 (plump:remove-child node))
1074 (let ((fc (plump:first-child node))
1075 (lc (plump:last-child node)))
1076 (when (and (plump:element-p fc) (tag-is fc "br")) (plump:remove-child fc))
1077 (when (and (not (eql fc lc)) (plump:element-p lc) (tag-is lc "br")) (plump:remove-child lc)))
1078 (when with-toc
1079 (incf section-count)
1080 (unless (plump:attribute node "id") (setf (plump:attribute node "id") (format nil "section-~A" section-count)))
1081 (let* ((header-level (parse-integer (subseq (plump:tag-name node) 1)))
1082 (header-text (with-output-to-string (stream)
1083 (plump:traverse node
1084 (lambda (n)
1085 (typecase n
1086 (plump:text-node
1087 (when (text-node-is-not n "style" "script")
1088 (write-string (plump:text n) stream))))))))
1089 (anchor-old (or (plump:attribute node "id") (format nil "section-~A" section-count)))
1090 (anchor-new (title-to-anchor header-text used-anchors))
1091 (wrapper (wrap-children node "span")))
1092 (setf min-header-level (min min-header-level header-level)
1093 (plump:attribute node "id") anchor-new
1094 (plump:attribute wrapper "id") anchor-old)
1095 (push (list header-level
1096 header-text
1097 anchor-new)
1098 contents))))))
1099 ((and (tag-is node "span") (find "footnote-back-link" (class-list node) :test #'string-equal))
1100 (plump:traverse node #'move-children-out-of-node :test (lambda (n) (tag-is n "sup" "strong"))))
1101 ((tag-is node "style")
1102 (let ((text (plump:text node)))
1103 (when (search ".mjx-math" text)
1104 (setf (gethash text style-hash) t)))
1105 (plump:remove-child node))
1106 ((tag-is node "script")
1107 (plump:remove-child node))))))))
1108 (clean-dom-text root)
1109 (let ((with-toc (>= section-count 3))
1110 (out-string (make-array 0 :element-type 'character :fill-pointer 0 :adjustable t)))
1111 (with-output-to-string (out-stream out-string)
1112 (style-hash-to-html style-hash out-stream)
1113 (when (> (hash-table-count used-colors) 0)
1114 (format out-stream "<style>~%:root {~%")
1115 (maphash (lambda (name rgba-list)
1116 (declare (ignore rgba-list))
1117 (format out-stream " --user-color-~A: #~A;~%" name name))
1118 used-colors)
1119 (flet ((write-inverted-colors (theme)
1120 (format out-stream "body.theme-~A {~%" theme)
1121 (maphash (lambda (name rgba-list)
1122 (format out-stream " --user-color-~A: ~A;~%" name
1123 (multiple-value-call #'encode-css-color (apply #'perceptual-invert-rgba rgba-list))))
1124 used-colors)
1125 (format out-stream "}~%")))
1126 (format out-stream "}~%@media (prefers-color-scheme: dark) {~%")
1127 (write-inverted-colors "default")
1128 (format out-stream "}~%")
1129 (write-inverted-colors "dark"))
1130 (format out-stream "</style>"))
1131 (with-dynamic-block-serialization (current-memo-hash out-string)
1132 (loop for c across (plump:children root)
1133 when (and with-toc
1134 (not (or (string-is-whitespace (plump:text c))
1135 (tag-is c "figure"))))
1136 do (progn
1137 (contents-to-html (nreverse contents) min-header-level out-stream)
1138 (setf with-toc nil))
1139 do (plump:serialize c out-stream)))
1140 out-string)))))))