LJSUP-17669: Login.bml form refactoring
[livejournal.git] / cgi-bin / cleanhtml.pl
bloba54a2f108d6a4433dc9999133ed3985ac9beeeb6
1 #!/usr/bin/perl
3 # This file is provided under the GNU General Public License.
4 # A copy of that license can be found in the LICENSE-LiveJournal.txt file included as
5 # part of this distribution.
7 # Original code related to the 'cut_retrieve' option of the 'clean' method by Afuna in Dreamwidth (http://www.dreamwidth.org/)
9 use strict;
10 use Class::Autouse qw(
11 URI
12 HTMLCleaner
13 LJ::CSS::Cleaner
14 HTML::TokeParser
15 LJ::EmbedModule
16 LJ::Config
17 LJ::Maps
18 LJ::UserApps
19 LJ::CleanHtml::Like
22 LJ::Config->load;
24 package LJ;
26 use Encode;
27 use LJ::EmbedModule;
28 use HTML::Entities;
# <LJFUNC>
# name: LJ::strip_bad_code
# class: security
# des: Removes malicious/annoying HTML from the referenced text, in place.
# info: Thin wrapper around [func[LJ::CleanHTML::clean]] that always passes
#       the same option set: the layer, script, object and embed tags are
#       put on the 'eat' list, the mode is 'allow', and comments are kept.
# args: textref
# des-textref: Scalar reference to text to be cleaned.
# returns: Nothing useful (whatever LJ::CleanHTML::clean returns).
# </LJFUNC>
sub strip_bad_code {
    my ($textref) = @_;

    # Fixed cleaning policy applied by this wrapper.
    my %clean_opts = (
        'eat'          => [qw[layer script object embed]],
        'mode'         => 'allow',
        'keepcomments' => 1,    # Allows CSS to work
    );

    LJ::CleanHTML::clean($textref, \%clean_opts);
}
49 package LJ::CleanHTML;
50 # LJ::CleanHTML::clean(\$u->{'bio'}, {
51 # 'wordlength' => 100, # maximum length of an unbroken "word"
52 # 'addbreaks' => 1, # insert <br/> after newlines where appropriate
53 # 'tablecheck' => 1, # make sure they aren't closing </td> that weren't opened.
54 # 'eat' => [qw(head title style layer iframe)],
55 # 'mode' => 'allow',
56 # 'deny' => [qw(marquee)],
57 # 'remove' => [qw()],
58 # 'maximgwidth' => 100,
59 # 'maximgheight' => 100,
60 # 'keepcomments' => 1,
61 # 'cuturl' => 'http://www.domain.com/full_item_view.ext',
62 # 'ljcut_disable' => 1, # stops the cleaner from using the lj-cut tag
63 # 'cleancss' => 1,
64 # 'extractlinks' => 1, # remove a hrefs; implies noautolinks
65 # 'noautolinks' => 1, # do not auto linkify
66 # 'extractimages' => 1, # placeholder images
67 # 'transform_embed_nocheck' => 1, # do not do checks on object/embed tag transforming
68 # 'transform_embed_wmode' => <value>, # define a wmode value for videos (usually 'transparent' is the value you want)
69 # 'blocked_links' => [ qr/evil\.com/, qw/spammer\.com/ ], # list of sites whose URLs will be blocked
70 # 'blocked_link_substitute' => 'http://domain.com/error.html' # blocked links will be replaced by this URL
71 # 'allowed_img_attrs' => hashref of allowed img attributes; other attrs are removed.
72 # 'remove_all_attribs' => 1, # remove all attributes from html tags
73 # 'remove_attribs' => [qw/id class style/], # remove specified attributes only
74 # });
# Instantiates an HTML::TokeParser on an empty string and immediately asks it
# to destroy itself. HTML::TokeParser is listed in the Class::Autouse block at
# the top of the file, so this first method call is presumably what forces the
# module to be loaded ahead of time -- TODO confirm against callers.
# Any error raised while destroying the parser is deliberately swallowed.
sub helper_preload {
    my $parser = HTML::TokeParser->new("");

    # Best effort only: failures here must not propagate to the caller.
    return eval { $parser->DESTROY(); };
}
# $onechar matches one logical "character": a plain non-whitespace character
# or an &entity; (named or numeric, up to 7 word characters). When
# $LJ::UNICODE is set, a multi-byte UTF-8 sequence is also matched as a
# single character and stray high-bit bytes are excluded.

# Byte patterns for 2-, 3- and 4-byte UTF-8 sequences.
my $utf_longchar = '[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]';

# Pick the pattern source once, depending on whether the site runs in
# Unicode mode.
my $match = $LJ::UNICODE
    ? $utf_longchar . '|[^&\s\x80-\xff]|(?:&\#?\w{1,7};)'
    : '[^&\s]|(&\#?\w{1,7};)';

# Compiled once; the pattern source never changes after load.
my $onechar = qr/$match/o;
# Some browsers, such as Internet Explorer, allow certain HTML tags to act
# as aliases of other tags. That is a problem for the cleaner: the browser
# treats the alias exactly like the original tag, while the HTML cleaner
# would otherwise treat it as a different tag. Aliases listed here are
# rewritten to the real tag name before any other processing.
# Format: 'alias' => 'real'
my %tag_substitute = (
    image => 'img',
);

# In XHTML you can close a tag in the same opening tag like <br />,
# but some browsers will still interpret it as an opening-only tag.
# This pattern matches (case-insensitively, whole name only) the tags which
# you can actually close with a trailing slash and get the proper behavior
# from a browser.
my $slashclose_tags = do {
    my $names = join '|', qw(
        area base basefont br col embed frame hr img input
        isindex link meta param lj-embed
    );
    qr/^(?:$names)$/i;
};

# Package-level override read by clean(): when defined, its value is used
# as the "dynamic elements enabled" flag; when left undefined, clean() falls
# back to LJ::is_web_context().
our $EnableDynamicElements = undef;
115 # <LJFUNC>
116 # name: LJ::CleanHTML::clean
117 # class: text
118 # des: Multi-faceted HTML parse function
119 # info:
120 # args: data, opts
121 # des-data: A reference to HTML to parse to output, or HTML if modified in-place.
122 # des-opts: An hash of options to pass to the parser.
123 # returns: Nothing.
124 # </LJFUNC>
125 sub clean {
126 my $data = shift;
127 my $opts = shift;
128 my $newdata;
130 # remove the auth portion of any see_request.bml links
131 $$data =~ s/(see_request\.bml\S+?)auth=\w+/$1/ig;
132 $$data =~ s/(<lj\-random\s*\/?>)/int(rand(10_000_000))/gie;
133 $$data =~ s/(\&lt\;lj\-random\s*\/?\&gt\;)/int(rand(10_000_000))/gie;
135 # decode escapes to get a valid unicode string
136 # we encode it back before return
137 $$data = Encode::decode_utf8($$data);
139 my $p = HTML::TokeParser->new($data);
140 my $wordlength = $opts->{'wordlength'};
141 my $addbreaks = $opts->{'addbreaks'};
142 my $keepcomments = $opts->{'keepcomments'};
143 my $mode = $opts->{'mode'};
144 my $undefined_tags = $opts->{undefined_tags} || '';
145 my $cut = $opts->{'cuturl'} || $opts->{'cutpreview'};
146 my $ljcut_disable = $opts->{'ljcut_disable'};
147 my $s1var = $opts->{'s1var'};
148 my $extractlinks = 0 || $opts->{'extractlinks'};
149 my $noautolinks = $extractlinks || $opts->{'noautolinks'};
150 my $noexpand_embedded = $opts->{'noexpandembedded'} || $opts->{'textonly'} || 0;
151 my $transform_embed_nocheck = $opts->{'transform_embed_nocheck'} || 0;
152 my $transform_embed_wmode = $opts->{'transform_embed_wmode'};
153 my $remove_colors = $opts->{'remove_colors'} || 0;
154 my $remove_sizes = $opts->{'remove_sizes'} || 0;
155 my $remove_fonts = $opts->{'remove_fonts'} || 0;
156 my $blocked_links = (exists $opts->{'blocked_links'}) ? $opts->{'blocked_links'} : \@LJ::BLOCKED_LINKS;
157 my $blocked_link_substitute =
158 (exists $opts->{'blocked_link_substitute'}) ? $opts->{'blocked_link_substitute'} :
159 ($LJ::BLOCKED_LINK_SUBSTITUTE) ? $LJ::BLOCKED_LINK_SUBSTITUTE : '#';
160 my $suspend_msg = $opts->{'suspend_msg'} || 0;
161 my $unsuspend_supportid = $opts->{'unsuspend_supportid'} || 0;
162 my $remove_all_attribs = $opts->{'remove_all_attribs'} || 0;
163 my %remove_attribs = ($opts->{'remove_attribs'}) ?
164 (map {$_ => 1} @{ $opts->{'remove_attribs'} }) : ();
165 my $remove_positioning = $opts->{'remove_positioning'} || 0;
166 my $placeholders = $opts->{'placeholders'} || 0;
167 my $target = $opts->{'target'} || '';
168 my $ljrepost_allowed = ($opts->{ljrepost_allowed} && ! $opts->{'textonly'}) || 0;
169 my $cut_retrieve = $opts->{cut_retrieve} || 0;
170 my $expand_lj_user_tag = $opts->{'expand_lj_user_tag'} || 0;
171 my $skip_lj_user_tag = $opts->{'skip_lj_user_tag'} || 0;
173 my $enable_dynamic_elements = $EnableDynamicElements;
174 unless ( defined $enable_dynamic_elements ) {
175 $enable_dynamic_elements = LJ::is_web_context();
177 $enable_dynamic_elements = 0 if $opts->{'textonly'};
179 my $ljspoiler_allowed = $enable_dynamic_elements;
181 my $poster = $opts->{poster} || LJ::load_userid($opts->{posterid});
182 my $put_nofollow = not ($poster and $poster->get_cap('paid') and not $poster->get_cap('trynbuy'));
184 my $viewer_lang = $opts->{'viewer_lang'};
185 unless ($viewer_lang) {
186 $viewer_lang = LJ::Lang::get_remote_lang();
189 # cuturl or entry_url tells about context and texts address,
190 # Expand or close lj-cut tag should be switched directly by special flag
191 # - expand_cut
192 $cut = '' if $opts->{expand_cut};
194 my @canonical_urls; # extracted links
195 my %action = ();
196 my %remove = ();
197 if (ref $opts->{'eat'} eq "ARRAY") {
198 foreach (@{$opts->{'eat'}}) { $action{$_} = "eat"; }
200 if (ref $opts->{'allow'} eq "ARRAY") {
201 foreach (@{$opts->{'allow'}}) { $action{$_} = "allow"; }
203 if (ref $opts->{'deny'} eq "ARRAY") {
204 foreach (@{$opts->{'deny'}}) { $action{$_} = "deny"; }
206 if (ref $opts->{'remove'} eq "ARRAY") {
207 foreach (@{$opts->{'remove'}}) { $action{$_} = "deny"; $remove{$_} = 1; }
210 $action{'script'} = "eat";
212 # if removing sizes, remove heading tags
213 if ($remove_sizes) {
214 foreach my $tag (qw( h1 h2 h3 h4 h5 h6 )) {
215 $action{$tag} = "deny";
216 $remove{$tag} = 1;
220 if ($opts->{'strongcleancss'}) {
221 $opts->{'cleancss'} = 1;
224 my @attrstrip = qw();
225 # cleancss means clean annoying css
226 # clean_js_css means clean javascript from css
227 if ($opts->{'cleancss'}) {
228 push @attrstrip, 'id';
229 $opts->{'clean_js_css'} = 1;
232 if ($opts->{'nocss'}) {
233 push @attrstrip, 'style';
236 if (ref $opts->{'attrstrip'} eq "ARRAY") {
237 foreach (@{$opts->{'attrstrip'}}) { push @attrstrip, $_; }
240 my %opencount = ();
241 my @tablescope = ();
243 my $cutcount = 0;
245 # bytes known good. set this BEFORE we start parsing any new
246 # start tag, where most evil is (because where attributes can be)
247 # then, if we have to totally fail, we can cut stuff off after this.
248 my $good_until = 0;
250 # then, if we decide that part of an entry has invalid content, we'll
251 # escape that part and stuff it in here. this lets us finish cleaning
252 # the "good" part of the entry (since some tags might not get closed
253 # till after $good_until bytes into the text).
254 my $extra_text;
255 my $total_fail = sub {
256 my $tag = LJ::ehtml(@_);
258 my $edata = LJ::ehtml($$data);
259 $edata =~ s/\r?\n/<br \/>/g if $addbreaks;
261 $extra_text = "<div class='ljparseerror'>[<b>Error:</b> Irreparable invalid markup ('&lt;$tag&gt;') in entry. ".
262 "Owner must fix manually. Raw contents below.]<br /><br />" .
263 '<div style="width: 95%; overflow: auto">' . $edata . '</div></div>';
266 ## We do not need to eat a tag 'iframe' if it enabled here.
267 my $htmlcleaner = HTMLCleaner->new(
268 valid_stylesheet => \&LJ::valid_stylesheet_url,
269 enable_iframe => (grep { $_ eq 'iframe' && $action{$_} == "allow" ? 1 : 0 } keys %action) ? 1 : 0
272 my $eating_ljuser_span = 0; # bool, if we're eating an ljuser span
273 my $ljuser_text_node = ""; # the last text node we saw while eating ljuser tags
274 my @eatuntil = (); # if non-empty, we're eating everything. thing at end is thing
275 # we're looking to open again or close again.
277 my $capturing_during_eat; # if we save all tokens that happen inside the eating.
278 my @capture = (); # if so, they go here
280 my $form_tag = {
281 input => 1,
282 select => 1,
283 option => 1,
286 my $start_capture = sub {
287 next if $capturing_during_eat;
289 my ($tag, $first_token, $cb) = @_;
290 push @eatuntil, $tag;
291 @capture = ($first_token);
292 $capturing_during_eat = $cb || sub {};
295 my $finish_capture = sub {
296 @capture = ();
297 $capturing_during_eat = undef;
300 # this is the stack that holds information about state of parsing
301 # <lj-lang> tags; the syntax of these is as follows:
303 # <lj-lang-container>
304 # <lj-lang include="en"> English text </lj-lang>
305 # <lj-lang include="de"> German text </lj-lang>
306 # <lj-lang include="en,de"> Text that displays in both
307 # English and German </lj-lang>
308 # <lj-lang otherwise> In case all above failed, this is
309 # the text </lj-lang>
310 # </lj-lang-container>
312 # it is pretty trivial to implement the 'include' versions of
313 # tags, and for the 'otherwise' version, we have a state variable
314 # indicating that we haven't yet seen an 'include' tag that had
315 # its language matching the remote's language. so when we occur
316 # an 'otherwise' tag, we figure whether to display its body using
317 # this variable.
319 # as for the stack, it allows us to make it so that:
320 # 1). container tags may be nested
321 # 2). lj-lang doesn't actually need to be inside of a container
323 # opening <lj-lang-container> unshifts the stack
324 # closing <lj-lang-container> shifts it
325 # when we need to access a 'variable', $lj_lang_otherwise[0] will do
327 # TODO: this comment indicates that the code is less than easy to
328 # understand and it would benefit from a refactor, i.e. encapsulating
329 # handling specific tags in some set of classes, or something.
330 # - ailyin, Nov 15, 2010
331 my @lj_lang_otherwise = ( 1 );
333 my %vkontakte_like_js;
334 my $in_link = 0;
335 my $img_link = 0;
336 my $href_b_link = '';
337 my $text_a_link = 0;
338 my $text_b_link = 0;
340 my $ljspoilers_open = 0;
342 # if we're retrieving a cut tag, then we want to eat everything
343 # until we hit the first cut tag.
344 my @cuttag_stack = ();
345 my $eatall = $cut_retrieve ? 1 : 0;
347 TOKEN:
348 while (my $token = $p->get_token) {
349 my $type = $token->[0];
351 # See if this tag should be treated as an alias
353 if ( ($type eq 'S' || $type eq 'E') ) {
354 $token->[1] = $tag_substitute{$token->[1]} if defined $tag_substitute{$token->[1]};
357 # start tag
358 if ($type eq "S") {
359 my $tag = $token->[1];
360 my $attr = $token->[2]; # hashref
362 $good_until = length $newdata;
364 # stupid hack to remove the class='ljcut' from divs when we're
365 # disabling them, so we account for the open div normally later.
366 my $ljcut_div = $tag eq "div" && lc $attr->{class} eq "ljcut";
367 if ($ljcut_div && $ljcut_disable) {
368 $ljcut_div = 0;
371 if (LJ::is_enabled('remove_allowscriptaccess')) {
372 ## TODO: remove closing </param> tag,
373 ## don't strip 'allowscriptaccess' from YouTube and other trusted sites
374 if ($tag eq 'param' && $attr->{name} eq 'allowscriptaccess') {
375 next TOKEN;
377 if ($tag eq 'embed' && keys %$attr) {
378 # LJSUP-15368: don't delete allowScriptAccess from trusted sites
379 # probably it's must placed in transform_embed hook...
380 my $site = $attr->{src};
381 $site =~ m{(?:https?:)?//(?:[\w\-]+\.)*([\w\-]+\.\w*)}; #get site url from src
382 $site = $1;
383 unless ( grep($_ eq 'allowScriptAccess', @{$LJ::WHITELIST_VIDEO_HOSTS{$site}->{'other_whitelist'}}) ) {
384 delete $attr->{allowscriptaccess};
389 if (@eatuntil) {
390 push @capture, $token if $capturing_during_eat;
391 if ($tag eq $eatuntil[-1]) {
392 push @eatuntil, $tag;
394 next TOKEN;
397 # if we're looking for cut tags, ignore everything that's
398 # not a cut tag.
399 if ( $eatall && $tag ne "lj-cut" && !$ljcut_div ) {
400 next TOKEN;
404 if ( $opts->{'img_placeholders'} ) {
405 if ( $tag eq 'a' ) {
406 $in_link = 1;
407 $href_b_link = $attr->{href};
408 $text_a_link = 0;
411 if ( $tag eq 'img' && $in_link ) {
412 $img_link = 1;
413 $newdata .= '</a>';
417 if ($tag eq "lj-template" && ! $noexpand_embedded) {
418 my $name = $attr->{name} || "";
419 $name =~ s/-/_/g;
421 my $run_template_hook = sub {
422 # can pass in tokens to override passing the hook the @capture array
423 my ($token, $override_capture) = @_;
424 my $capture = $override_capture ? [$token] : \@capture;
426 # In $expanded we must has valid unicode string.
427 my $expanded = ($name =~ /^\w+$/) ?
428 Encode::decode_utf8(LJ::run_hook("expand_template_$name", $capture, remove_video_sizes => $opts->{remove_video_sizes})) : "";
429 $newdata .= $expanded || "<b>[Error: unknown template '" . LJ::ehtml($name) . "']</b>";
432 if ($attr->{'/'}) {
433 # template is self-closing, no need to do capture
434 $run_template_hook->($token, 1);
435 } else {
436 # capture and send content to hook
437 $start_capture->("lj-template", $token, $run_template_hook);
439 next TOKEN;
442 if ($tag eq "lj-replace") {
443 my $name = $attr->{name} || "";
444 my $replace = ($name =~ /^\w+$/) ? Encode::decode_utf8(LJ::lj_replace($name, $attr)) : undef;
445 $newdata .= defined $replace ? $replace : "<b>[Error: unknown lj-replace key '" . LJ::ehtml($name) . "']</b>";
447 next TOKEN;
450 if ($tag eq 'lj-map') {
451 $newdata .= LJ::Maps->expand_ljmap_tag($attr);
452 next TOKEN;
456 # lj-repost tag adds button that allows easily post text in remote user's blog.
458 # Usage:
459 # 1. <lj-repost />
460 # 2. <lj-repost button="post this" />
461 # 3. <lj-repost>some text</lj-repost>
462 # 4. <lj-repost button="re-post to your journal" subject="WOW">
463 # text to repost
464 # </lj-repost>
466 if ($tag eq "lj-repost" and $ljrepost_allowed){
467 next TOKEN if ref $opencount{$tag}; # no support for nested <lj-repost> tags
468 my $button = LJ::ehtml($attr->{button}) ||
469 Encode::decode_utf8(LJ::Lang::ml("repost.default_button"));
470 if ($attr->{'/'}){
471 # short <lj-repost /> form of tag
472 $newdata .= qq[<form action="http://www.$LJ::DOMAIN/update.bml" method="GET">]
473 . qq[<input type="hidden" name="repost" value="$opts->{cuturl}" />]
474 . qq[<input type="hidden" name="repost_type" value="a" />]
475 . qq[<input type="submit" value="$button" /> ]
476 . qq[</form>];
477 } else {
478 $opencount{$tag} = {
479 button => $button,
480 subject => $attr->{subject},
481 offset => length $newdata,
484 next TOKEN;
487 # LJSUP-11810: Change the widget trava.ru
488 # bypass S2 "print safe" function.
489 # <lj-music> must be expanded at last order
490 if ( $tag eq 'lj-music' && ! $opts->{'ignore_lj_music'} ) {
491 $newdata .= LJ::Setting::Music::format_ljmusic( $attr->{'provider'}, $attr->{'id'} );
493 next TOKEN;
495 elsif ( $tag eq 'lj-music' ) {
496 $newdata .= $token->[4];
499 ## lj-userpic:
500 ## <lj-userpic> - current journal's default userpic
501 ## <lj-userpic remote> - remote user's default userpic
502 ## <lj-userpic user="test"> - test's default userpic
503 if ($tag eq "lj-userpic" and !$opts->{'textonly'} and $action{$tag} ne 'deny') {
504 my $u = '';
505 if ($attr->{user}){
506 $u = LJ::load_user($attr->{user});
507 } elsif ($attr->{remote}){
508 $u = LJ::get_remote();
509 } else {
510 my $cur_journal = LJ::Session->domain_journal;
511 $u = LJ::load_user($cur_journal) if $cur_journal;
514 my $upic = ref $u ? $u->userpic : '';
515 if ($upic){
516 $newdata .= $upic->imgtag;
517 } else {
518 $newdata .= qq|<img src="http://wh.livejournal.ru/icons/nouserpic.png" width="100" height="100" class="userpic-img" />|;
520 next TOKEN;
523 if ($tag eq "lj-wishlist") {
524 my $wishid = $attr->{wishid};
525 my $userid = $attr->{userid};
526 $newdata .= Encode::decode_utf8(LJ::WishElement->check_and_expand_entry($userid, $wishid));
529 if ( $tag eq 'lj-spoiler' ) {
530 next TOKEN unless $ljspoiler_allowed;
532 my $title = exists $attr->{'title'} && length $attr->{'title'}
533 ? $attr->{'title'}
534 : $attr->{'text'} || Encode::decode_utf8( LJ::Lang::ml('fcklang.ljspoiler.prompt.text') );
536 $title = LJ::ehtml($title);
538 $newdata .= qq{<div class="lj-spoiler"><div class="lj-spoiler-head">[<b><a href="#">$title</a></b>]</div><div class="lj-spoiler-body">};
539 $ljspoilers_open++;
540 next TOKEN;
543 # Capture object and embed tags to possibly transform them into something else.
544 if ($tag eq "object" || $tag eq "embed") {
545 if (LJ::are_hooks("transform_embed") && !$noexpand_embedded) {
546 # XHTML style open/close tags done as a singleton shouldn't actually
547 # start a capture loop, because there won't be a close tag.
548 if ($attr->{'/'}) {
549 $newdata .= LJ::run_hook(
550 "transform_embed",
551 [$token],
552 nocheck => $transform_embed_nocheck,
553 wmode => $transform_embed_wmode,
554 video_placeholders => $opts->{video_placeholders},
555 remove_video_sizes => $opts->{remove_video_sizes},
556 no_encode => 1,
557 ) || "";
558 next TOKEN;
561 $start_capture->($tag, $token, sub {
562 my $expanded = LJ::run_hook(
563 "transform_embed",
564 \@capture,
565 nocheck => $transform_embed_nocheck,
566 wmode => $transform_embed_wmode,
567 video_placeholders => $opts->{video_placeholders},
568 remove_video_sizes => $opts->{remove_video_sizes},
569 no_encode => 1,
571 $newdata .= $expanded || "";
573 next TOKEN;
577 if ($tag eq "span" && lc $attr->{class} eq "ljuser" && ! $noexpand_embedded) {
578 $eating_ljuser_span = 1;
579 $ljuser_text_node = "";
582 if ($eating_ljuser_span) {
583 next TOKEN;
586 if (($tag eq "div" || $tag eq "span") && lc $attr->{class} eq "ljvideo") {
587 $start_capture->($tag, $token, sub {
588 my $expanded = LJ::run_hook("expand_template_video", \@capture);
589 $newdata .= $expanded || "<b>[Error: unknown template 'video']</b>";
591 next TOKEN;
594 # do some quick checking to see if this is an email address/URL, and if so, just
595 # escape it and ignore it
596 if ($tag =~ m!(?:\@|://)!) {
597 $newdata .= LJ::ehtml("<$tag>");
598 next;
601 if ($form_tag->{$tag}) {
602 if (! $opencount{form}) {
603 $newdata .= "&lt;$tag ... &gt;";
604 next;
607 if ($tag eq "input") {
608 if ($attr->{type} !~ /^\w+$/ || lc $attr->{type} eq "password") {
609 delete $attr->{type};
614 my $slashclose = 0; # If set to 1, use XML-style empty tag marker
615 # for tags like <name/>, pretend it's <name> and reinsert the slash later
616 $slashclose = 1 if ($tag =~ s!/$!!);
618 unless ($tag =~ /^\w([\w\-:_]*\w)?$/) {
619 $total_fail->($tag);
620 last TOKEN;
623 # for incorrect tags like <name/attrib=val> (note the lack of a space)
624 # delete everything after 'name' to prevent a security loophole which happens
625 # because IE understands them.
626 $tag =~ s!/.+$!!;
628 # Try to execute default action on undefined tags
629 next if (!$action{$tag} && $undefined_tags eq "eat");
631 if ( $action{$tag} eq "eat" || $tag =~ /^fb|g:/ ) {
632 $p->unget_token($token);
633 $p->get_tag("/$tag");
634 next;
637 if ($tag eq 'iframe' || $tag eq 'video' || $tag eq 'audio' || $tag eq 'source') {
639 ## Remove all autoplay tags
640 delete $attr->{'autoplay'};
642 ## Allow some iframes from trusted sources (if they are not eaten already)
643 ## YouTube (http://apiblog.youtube.com/2010/07/new-way-to-embed-youtube-videos.html),
644 ## Vimeo, VKontakte, Google Calendar, Google Docs, VK.com, etc.
645 ## see @LJ::EMBED_IFRAME_WHITELIST in lj-disabled-conf
646 my $src_allowed = 0;
648 if (my $src = $attr->{'src'}) {
649 foreach my $wl ( @LJ::EMBED_IFRAME_WHITELIST ) {
650 if ($src =~ $wl->{re}) {
651 if ($wl->{personal_posts_only}) {
652 last unless $opts->{journalid};
653 my $u = LJ::load_userid($opts->{journalid});
654 last unless $u && $u->is_personal;
656 $src_allowed = 1;
657 last;
660 ## tags video and audio may have no attribute 'src'
661 ## and using special tag <source>
662 } elsif ($tag =~ /^(?:video|audio)$/) {
663 $src_allowed = 1;
666 unless ($src_allowed) {
667 ## eat this tag
668 if (!$attr->{'/'}) {
669 ## if not autoclosed tag (<iframe />),
670 ## then skip everything till the closing tag
671 $p->get_tag("/iframe");
673 next TOKEN;
677 # try to call HTMLCleaner's element-specific cleaner on this open tag
678 my $clean_res = eval {
679 my $cleantag = $tag;
680 $cleantag =~ s/^.*://s;
681 $cleantag =~ s/[^\w]//go;
682 no strict 'subs';
683 my $meth = "CLEAN_$cleantag";
684 my $seq = $token->[3]; # attribute names, listref
685 my $code = $htmlcleaner->can($meth)
686 or return 1;
687 return $code->($htmlcleaner, $seq, $attr);
690 next if !$@ && !$clean_res;
692 # this is so the rte converts its source to the standard ljuser html
693 my $ljuser_div = $tag eq "div" && $attr->{class} eq "ljuser";
694 if ($ljuser_div) {
696 my $href = $p->get_tag("a");
697 my $href_attr = $href->[1]->{"href"};
698 my $username = LJ::get_user_by_url ( $href_attr );
699 $attr->{'user'} = $username ? $username : '';
701 my $ljuser_text = $p->get_text("/b");
702 $p->get_tag("/div");
703 $ljuser_text =~ s/\[info\]//;
704 $tag = "lj";
705 $attr->{'title'} = $ljuser_text;
709 # no cut URL, record the anchor, but then fall through
710 if (0 && $ljcut_div && !$cut) {
711 $cutcount++;
712 $newdata .= "<a name=\"cutid$cutcount\"></a>";
713 $ljcut_div = 0;
716 if ( $tag eq 'lj-lang' ) {
717 # extract a "standard" type of lang here;
718 # also, it's a weird way to convert en_LJ -> en
719 my $lang = LJ::lang_to_locale($viewer_lang);
720 $lang =~ s/_.*//;
722 if ($attr->{'include'}) {
723 my @include = split /[,;\s]+/, $attr->{'include'};
724 if ( grep { $_ eq $lang } @include ) {
725 $lj_lang_otherwise[0] = 0;
726 next TOKEN;
730 if ( $attr->{'otherwise'} || $attr->{'default'} ) {
731 next TOKEN if ($lj_lang_otherwise[0]);
734 push @eatuntil, $tag;
737 if ( $tag eq 'lj-lang-container' ) {
738 unshift @lj_lang_otherwise, 1;
741 if (($tag eq "lj-cut" || $ljcut_div)) {
742 next TOKEN if $ljcut_disable;
743 $cutcount++;
745 # if this is the cut tag we're looking for, then push it
746 # onto the stack (in case there are nested cut tags) and
747 # start including the content.
748 if ( $eatall ) {
749 if ( $cutcount == $cut_retrieve ) {
750 $eatall = 0;
751 push @cuttag_stack, $tag;
753 next TOKEN;
756 my $link_text = sub {
757 my $text = LJ::Lang::ml('fcklang.readmore');
758 $text = Encode::decode_utf8($text) if $text;
759 if (exists $attr->{'text'} && length $attr->{'text'}) {
760 $text = $attr->{'text'};
761 $text =~ s/</&lt;/g;
762 $text =~ s/>/&gt;/g;
764 return $text;
766 if ($cut) {
767 my $etext = $link_text->();
768 my $url = LJ::ehtml($cut);
769 $newdata .= "<div>" if $tag eq "div";
770 my $data_ids = "";
771 if ($opts->{entry_url} && $opts->{entry_url} ne '#') {
772 my $entry = LJ::Entry->new_from_url($opts->{entry_url});
773 my $ditemid = 0;
774 my $journalid = $entry->journalid;
775 if ($entry && $entry->valid) {
776 $ditemid = $entry->ditemid;
778 $data_ids = qq(data-widget='ljcut' data-widget-options='{ "journalid": "$journalid", "ditemid": "$ditemid", "cutid": "$cutcount", "placeholders" : $placeholders }');
780 $newdata .= "<b $data_ids class=\"ljcut-link lj-widget\"><span class='ljcut-brace'>(&nbsp;</span><span class=\"ljcut-decor\"><a href=\"$url#cutid$cutcount\" class=\"ljcut-link-expand\">$etext</a>";
781 $newdata .= "<a href=\"$url#cutid$cutcount\" class=\"ljcut-link-collapse\">".Encode::decode_utf8(LJ::Lang::ml("ljcut.collapse"))."</a>" unless $opts->{no_ljcut_collapse};
782 $newdata .= "</span><span class='ljcut-brace'>&nbsp;)</span></b>";
783 $newdata .= "</div>" if $tag eq "div";
784 unless ($opts->{'cutpreview'}) {
785 push @eatuntil, $tag;
786 next TOKEN;
788 } else {
789 $newdata .= "<a name=\"cutid$cutcount\"></a>" unless $opts->{'textonly'};
790 if ($tag eq "div" && !$opts->{'textonly'}) {
791 $opencount{"div"}++;
792 my $etext = $link_text->();
793 $newdata .= "<div class=\"ljcut\" text=\"$etext\">";
795 next;
798 elsif ($tag eq "style") {
799 my $style = $p->get_text("/style");
800 $p->get_tag("/style");
801 unless ($LJ::DISABLED{'css_cleaner'}) {
802 my $cleaner = LJ::CSS::Cleaner->new;
803 $style = $cleaner->clean($style);
804 LJ::run_hook('css_cleaner_transform', \$style);
805 if ($LJ::IS_DEV_SERVER) {
806 $style = "/* cleaned */\n" . $style;
809 $newdata .= "\n<style>\n$style</style>\n";
810 next;
812 elsif ( ($tag eq "lj-app") || ($tag eq "lj-widget") )
814 next TOKEN unless LJ::is_enabled('userapps');
815 my %app_attr = map { $_ => Encode::encode_utf8($attr->{$_}) } keys %$attr;
817 if ($tag eq "lj-widget") {
818 $app_attr{type} = 'widget';
819 $app_attr{key} = delete $app_attr{name};
822 my $app = LJ::UserApps->get_application( id => delete $app_attr{id}, key => delete $app_attr{key} );
823 next TOKEN unless $app && $app->can_show_restricted;
825 # Gain all context data
826 my %context;
827 $context{posterid} = $opts->{posterid} if($opts->{posterid});
828 $context{journalid} = $opts->{journalid} if($opts->{journalid});
829 if($opts->{entry_url}) {
830 my $entry = LJ::Entry->new_from_url($opts->{entry_url});
831 if ($entry && $entry->valid) {
832 $context{ditemid} = $entry->ditemid;
836 $newdata .= Encode::decode_utf8($app->ljapp_display(viewer => LJ::get_remote(), owner => $poster, attrs => \%app_attr, context => \%context), Encode::FB_QUIET);
837 next TOKEN;
839 elsif ($tag eq "lj" && !$skip_lj_user_tag)
841 # keep <lj comm> working for backwards compatibility, but pretend
842 # it was <lj user> so we don't have to account for it below.
843 my $user = $attr->{'user'} = exists $attr->{'user'} ? $attr->{'user'} :
844 exists $attr->{'comm'} ? $attr->{'comm'} : undef;
846 if (length $user) {
847 my $orig_user = $user; # save for later, in case
848 $user = LJ::canonical_username($user);
849 if ($s1var) {
850 $newdata .= "%%ljuser:$1%%" if $attr->{'user'} =~ /^\%\%([\w\-\']+)\%\%$/;
851 } elsif (length $user) {
852 if ($opts->{'textonly'} && !$expand_lj_user_tag) {
853 $newdata .= $user;
854 } else {
855 my $title = Encode::encode_utf8($attr->{title});
856 my $ljuser = LJ::ljuser($user, { title => $title, target => $target } );
857 $newdata .= Encode::decode_utf8($ljuser);
859 } else {
860 $orig_user = LJ::no_utf8_flag($orig_user);
861 $newdata .= "<b>[Bad username: " . LJ::ehtml($orig_user) . "]</b>";
863 } else {
864 $newdata .= "<b>[Unknown LJ tag]</b>";
867 elsif ($tag eq "lj-raw") {
868 # Strip it out, but still register it as being open
869 $opencount{$tag}++;
871 elsif ($tag eq "lj-cvk-poll") {
872 $newdata .= Encode::decode_utf8(LJ::Widget::CVK->render_body());
874 elsif ( $tag eq 'lj-like' ) {
875 next TOKEN if $opts->{'textonly'};
877 unless ( exists $opts->{'entry_url'} && $opts->{'entry_url'} ) {
878 $newdata .= '<b>[lj-like in invalid context]</b>';
879 next TOKEN;
883 my $like = LJ::CleanHtml::Like->new({ 'entry_url' => $opts->{'entry_url'},
884 'buttons' => $attr->{'buttons'} ,
887 $newdata .= $like->html({ 'vkontakte_like_js' => \%vkontakte_like_js});
889 elsif ( $tag eq 'lj-lead' ) {
890 next TOKEN if $opencount{'lj-lead'};
892 $newdata .= qq{<div class="b-journalpreamble">};
893 $opencount{'lj-lead'}++;
895 elsif ( $tag eq 'lj-quote' ) {
896 $newdata .= qq{<div class="b-journalblockquote">};
897 $opencount{'lj-quote'}++;
899 elsif ( $tag eq 'lj-quote-cite' ) {
900 next TOKEN if !$opencount{'lj-quote'} || $opencount{'lj-quote-cite'};
902 $newdata .= qq{<cite class="b-journalblockquote-author">};
903 $opencount{'lj-quote-cite'}++;
905 elsif ( $tag eq 'lj-gallery' ) {
906 next TOKEN if $opencount{'lj-gallery'};
907 $opencount{'lj-gallery'}->{width} = $attr->{width};
908 $opencount{'lj-gallery'}->{height} = $attr->{height};
909 $newdata .= $token->[4];
911 elsif ( $tag eq 'lj-gallery-item' ) {
912 next TOKEN unless $opencount{'lj-gallery'};
914 my $src = $attr->{src};
915 my $width = $opencount{'lj-gallery'}->{width} ? qq{width="$opencount{'lj-gallery'}->{width}"} : '';
916 my $height = $opencount{'lj-gallery'}->{height} ? qq{height="$opencount{'lj-gallery'}->{height}"} : '';
918 $newdata .= qq{<lj-gallery-item><img src="$src" $width $height><lj-gallery-item-capture>};
920 elsif ( $tag eq 'lj-image' ) {
921 $opencount{'lj-image-a'} = 0;
923 my $src = $attr->{src};
924 my $href = $attr->{href};
926 my $height = $attr->{height};
927 my $width = $attr->{width};
929 my $center = defined $attr->{center} ? 'b-journalpicture-alignment' : '';
931 my $style = '';
933 if ($width || $height) {
934 $width = "width: ${width}px;" if $width;
935 $height = "height: ${height}px;" if $height;
936 $style = qq{style="$width $height"};
939 my $img = qq{<img $style class="b-journalpicture-image" src="$src">};
940 if ($href) {
941 $img = qq{<a href="$href">$img</a>};
944 $img = qq{<figure class="b-journalpicture b-journalpicture-alignment">$img<figcaption class="b-journalpicture-caption">};
946 $opencount{'lj-image'}++;
947 $newdata .= $img;
950 # Don't allow any tag with the "set" attribute
951 elsif ($tag =~ m/:set$/) {
952 next;
954 else
956 my $alt_output = 0;
958 my $hash = $token->[2];
959 my $attrs = $token->[3]; # attribute names, in original order
961 $slashclose = 1 if delete $hash->{'/'};
963 foreach (@attrstrip) {
964 # maybe there's a better place for this?
965 next if (lc $tag eq 'lj-embed' && lc $_ eq 'id');
966 delete $hash->{$_};
969 if ($tag eq "form") {
970 my $action = lc($hash->{'action'});
971 my $deny = 0;
972 if ($action =~ m!^https?://?([^/]+)!) {
973 my $host = $1;
974 $deny = 1 if
975 $host =~ /[%\@\s]/ ||
976 $LJ::FORM_DOMAIN_BANNED{$host};
977 } else {
978 $deny = 1;
980 delete $hash->{'action'} if $deny;
983 ATTR:
984 foreach my $attr (keys %$hash) {
985 if ( $remove_all_attribs || $remove_attribs{$attr} ) {
986 delete $hash->{$attr};
987 next;
990 if ($attr =~ /^(?:on|dynsrc)/) {
991 delete $hash->{$attr};
992 next;
995 if ($attr eq "data") {
996 delete $hash->{$attr} unless $tag eq "object";
997 next;
1000 unless ($opts->{entry_url}) {
1001 if ($attr eq 'width' || $attr eq 'height' ) {
1002 if ($hash->{$attr} > 1024*2) {
1003 $hash->{$attr} = 1024*2;
1008 ## warning: in comments left by anonymous users, <img src="something">
1009 ## is replaced by <a href="something"> (see 'extractimages' param)
1010 ## If "something" is "data:<script ...", we'll get a vulnerability
1011 if (($attr eq "href" || $attr eq 'src') && $hash->{$attr} =~ /^data/) {
1012 delete $hash->{$attr};
1013 next;
1016 if ($attr =~ /(?:^=)|[\x0b\x0d]/) {
1017 # Cleaner attack: <p ='>' onmouseover="javascript:alert(document/**/.cookie)" >
1018 # is returned by HTML::Parser as P_tag("='" => "='") Text( onmouseover...)
1019 # which leads to reconstruction of valid HTML. Clever!
1020 # detect this, and fail.
1021 $total_fail->("$tag $attr");
1022 last TOKEN;
1025 # ignore attributes that do not fit this strict scheme
1026 unless ($attr =~ /^[\w_:-]+$/) {
1027 $total_fail->("$tag " . (%$hash > 1 ? "[...] " : "") . "$attr");
1028 last TOKEN;
1031 $hash->{$attr} =~ s/[\t\n]//g;
1033 # IE ignores the null character, so strip it out
1034 $hash->{$attr} =~ s/\x0//g;
1036 # IE sucks:
1037 my $nowhite = $hash->{$attr};
1038 $nowhite =~ s/[\s\x0b]+//go;
1039 if ($nowhite =~ /(?:jscript|livescript|javascript|vbscript|about):/ix) {
1040 delete $hash->{$attr};
1041 next;
1044 if ($attr eq 'style') {
1045 if ($opts->{'cleancss'}) {
1046 # css2 spec, section 4.1.3
1047 # position === p\osition :(
1048 # strip all slashes no matter what.
1049 $hash->{style} =~ s/\\//g;
1051 # and catch the obvious ones ("[" is for things like document["coo"+"kie"]
1052 foreach my $css ("/*", "[", qw(absolute fixed expression eval behavior cookie document window javascript -moz-binding)) {
1053 if ($hash->{style} =~ /\Q$css\E/i) {
1054 delete $hash->{style};
1055 next ATTR;
1059 if ($opts->{'strongcleancss'}) {
1060 if ($hash->{style} =~ /-moz-|absolute|relative|outline|z-index|(?<!-)(?:top|left|right|bottom)\s*:|filter|-webkit-/io) {
1061 delete $hash->{style};
1062 next ATTR;
1066 # remove specific CSS definitions
1067 if ($remove_colors) {
1068 $hash->{style} =~ s/(?:background-)?color:.*?(?:;|$)//gi;
1071 if ($remove_sizes) {
1072 $hash->{style} =~ s/font-size:.*?(?:;|$)//gi;
1075 if ($remove_fonts) {
1076 $hash->{style} =~ s/font-family:.*?(?:;|$)//gi;
1079 if ($remove_positioning) {
1080 $hash->{style} =~ s/margin.*?(?:;|$)//gi;
1081 $hash->{style} =~ s/height\s*?:.*?(?:;|$)//gi;
1083 # strip excessive padding
1084 $hash->{style} =~ s/padding[^:]*?:\D*\d{3,}[^;]*(?:;|$)//gi;
1088 if ($opts->{'clean_js_css'} && ! $LJ::DISABLED{'css_cleaner'}) {
1089 # and then run it through a harder CSS cleaner that does a full parse
1090 my $css = LJ::CSS::Cleaner->new;
1091 $hash->{style} = $css->clean_property($hash->{style});
1095 if (
1096 lc $tag ne 'lj-embed' &&
1097 ( $attr eq 'class' || $attr eq 'id' ) &&
1098 $opts->{'strongcleancss'} )
1100 unless (exists $LJ::CLASSNAME_WHITELIST{$hash->{$attr}}) {
1101 delete $hash->{$attr};
1103 next;
1106 # reserve ljs_* ids for divs, etc so users can't override them to replace content
1107 if ($attr eq 'id' && $hash->{$attr} =~ /^ljs_/i) {
1108 delete $hash->{$attr};
1109 next;
1112 if ($s1var) {
1113 if ($attr =~ /%%/) {
1114 delete $hash->{$attr};
1115 next ATTR;
1118 my $props = $LJ::S1::PROPS->{$s1var};
1120 if ($hash->{$attr} =~ /^%%([\w:]+:)?(\S+?)%%$/ && $props->{$2} =~ /[aud]/) {
1121 # don't change it.
1122 } elsif ($hash->{$attr} =~ /^%%cons:\w+%%[^\%]*$/) {
1123 # a site constant with something appended is also fine.
1124 } elsif ($hash->{$attr} =~ /%%/) {
1125 my $clean_var = sub {
1126 my ($mods, $prop) = @_;
1128 # HTML escape and kill line breaks
1129 $mods = "attr:$mods" unless
1130 $mods =~ /^(color|cons|siteroot|sitename|img):/ ||
1131 $props->{$prop} =~ /[ud]/;
1132 return '%%' . $mods . $prop . '%%';
1135 $hash->{$attr} =~ s/[\n\r]//g;
1136 $hash->{$attr} =~ s/%%([\w:]+:)?(\S+?)%%/$clean_var->(lc($1), $2)/eg;
1138 if ($attr =~ /^(href|src|lowsrc|style)$/) {
1139 $hash->{$attr} = "\%\%[attr[$hash->{$attr}]]\%\%";
1144 # remove specific attributes
1145 if (($remove_colors && ($attr eq "color" || $attr eq "bgcolor" || $attr eq "fgcolor" || $attr eq "text")) ||
1146 ($remove_sizes && $attr eq "size") ||
1147 ($remove_fonts && $attr eq "face")) {
1148 delete $hash->{$attr};
1149 next ATTR;
1153 ## attribute lj-sys-message-close is used in SiteMessage's only
1154 if (exists $hash->{'lj-sys-message-close'}) {
1155 delete $hash->{'lj-sys-message-close'};
1156 if (my $mid = $opts->{'lj_sys_message_id'}) {
1157 $hash->{'onclick'} = "LiveJournal.closeSiteMessage(this, event, $mid)";
1158 push @$attrs, 'onclick';
1162 if (exists $hash->{href}) {
1163 ## links to some resources will be completely blocked
1164 ## and replaced by value of 'blocked_link_substitute' param
1165 if ($blocked_links) {
1166 foreach my $re (@$blocked_links) {
1167 if ($hash->{href} =~ $re) {
1168 $hash->{href} = sprintf($blocked_link_substitute, LJ::eurl($hash->{href}));
1169 last;
1174 unless ($hash->{href} =~ s/^lj:(?:\/\/)?(.*)$/ExpandLJURL($1)/ei) {
1175 $hash->{href} = canonical_url($hash->{href}, 1);
1179 if ($tag eq "img") {
1180 my $img_bad = 0;
1182 if ($opts->{'extractimages'}) { $img_bad = 1; }
1184 if ( my $maxwidth = $opts->{'maximgwidth'} ) {
1185 my $width = $hash->{'width'};
1186 if ( $width && $width !~ /\%$/ ) {
1187 $width =~ s/[^\d.]//g;
1188 if ( int $width > $maxwidth ) {
1189 delete $hash->{'width'};
1190 delete $hash->{'height'};
1195 # don't use placeholders for small images
1196 if ( $opts->{'img_placeholders'} ) {
1197 if ( exists $hash->{style} ) {
1198 if ( $hash->{style} =~ /[^\-]width\:\s*(\d+)(?:px)?\;/i ) {
1199 $hash->{width} = $1;
1202 if ( $hash->{style} =~ /[^\-]height\:\s*(\d+)(?:px)?\;/i ) {
1203 $hash->{height} = $1;
1207 if ( exists $hash->{width} && $hash->{width} =~ /^[\d.]+$/ && exists $hash->{height} && $hash->{height} =~ /^[\d.]+$/ ) {
1208 if ( $hash->{'width'} > 140 && $hash->{'height'} > 37 ) {
1209 $img_bad = 1;
1211 else {
1212 $img_bad = 0;
1215 else {
1216 delete $hash->{width} if exists $hash->{width};
1217 delete $hash->{height} if exists $hash->{height};
1218 $img_bad = 1;
1220 } else {
1221 $img_bad = 0;
1224 ## Option 'allowed_img_attrs' provides a list of allowed attributes
1225 if (my $allowed = $opts->{'allowed_img_attrs'}){
1226 while (my ($attr, undef) = each %$hash){
1227 delete $hash->{$attr} unless $allowed->{$attr};
1231 ## TODO: a better check of $hash->{src} is needed,
1232 ## known (fixed) vulnerability is src="data:..."
1233 $hash->{src} = canonical_url($hash->{src}, 1);
1235 ## Ratings can be cheated by commenting a popular post with
1236 ## <img src="http://my-journal.livejournal.com/12345.html">
1237 if ($hash->{src} =~ m!/\d+\.html$!) {
1238 next TOKEN;
1241 ## CDN:
1242 ## http://pics.livejournal.com/<certain-journal>/pic/000fbt9x* -> l-pics.livejournal.com
1243 ## TODO: make it work for communities too
1244 if ($hash->{'src'} =~ m!^http://(?:l-)?pics.livejournal.com/(\w+)/pic/(.*)$!i) {
1245 my ($journal, $rest) = ($1, $2);
1246 my $host = (!$LJ::DISABLED{'pics_via_cdn'} && $LJ::USE_CDN_FOR_PICS{$journal})
1247 ? "l-pics.livejournal.com" : "pics.livejournal.com";
1248 $hash->{'src'} = "http://$host/$journal/pic/$rest";
1251 if ($img_bad) {
1252 $newdata .= qq~<a class="b-mediaplaceholder b-mediaplaceholder-photo ~ . ( $opts->{'remove_img_sizes'} ? '"' : qq~ b-mediaplaceholder-good" style="width:$hash->{'width'}px;height:$hash->{'height'}px;"~ ) . ( $hash->{'width'} ? qq~ data-width="$hash->{'width'}"~ : '' ) . ( $hash->{'height'} ? qq~ data-height="$hash->{'height'}"~: '' ) . qq~data-href="$href_b_link" href="~ .
1253 LJ::ehtml($hash->{'src'}) . '" onclick="return LiveJournal.placeholderClick(this, \'image\')">' .
1254 '<span class="b-mediaplaceholder-outer">' .
1255 '<span class="b-mediaplaceholder-inner">' .
1256 '<i class="b-mediaplaceholder-pic"></i>' .
1257 '<span class="b-mediaplaceholder-label b-mediaplaceholder-view">' . Encode::decode_utf8(LJ::Lang::ml("mediaplaceholder.viewimage")) . '</span>'.
1258 '<span class="b-mediaplaceholder-label b-mediaplaceholder-loading">' . Encode::decode_utf8(LJ::Lang::ml("mediaplaceholder.loading")) . '</span>'.
1259 '</span>' .
1260 '</span>' .
1261 '</a>';
1262 $newdata .= $href_b_link ?
1263 '<a href="' . $href_b_link .'" class="b-mediaplaceholder-external" title="' . Encode::decode_utf8(LJ::Lang::ml("mediaplaceholder.link")) . '">' .
1264 '<i class="b-mediaplaceholder-bg"></i>' .
1265 '<i class="b-mediaplaceholder-pic"></i>' .
1266 '<span class="b-mediaplaceholder-inner">' . Encode::decode_utf8(LJ::Lang::ml("mediaplaceholder.link")) . '</span>' .
1267 '</a>' : '';
1268 $alt_output = 1;
1269 $opencount{"img"}++;
1273 if ($tag eq "a" && $extractlinks)
1275 push @canonical_urls, canonical_url($attr->{href}, 1);
1276 $newdata .= "<b>";
1277 next;
1280 if ($tag eq "a" and $hash->{href} and $put_nofollow) {
1281 if ($hash->{href} =~ m!^(https?://)?([^/]+?)(/.*)?$!) {
1282 my $host = $1;
1283 unless ($host =~ /\Q$LJ::DOMAIN\E$/i) {
1284 $hash->{rel} = "nofollow";
1285 push @$attrs, 'rel';
1290 ## LJSUP-10811: due to security issue only Flash is allowed
1291 ## LJSV-1995: Embedded video from http://video.yandex.ru doesn't shown
1292 if ($tag eq 'embed'){
1293 $hash->{type} = 'application/x-shockwave-flash';
1294 push @$attrs => 'type' unless grep { $_ eq 'type' } @$attrs;
1296 if ($tag eq 'object' and ($hash->{data} || $hash->{src})){
1297 $hash->{type} = 'application/x-shockwave-flash';
1298 push @$attrs => 'type' unless grep { $_ eq 'type' } @$attrs;
1301 # LJSV-2152: When comment has embed in it - bubbles should be above buttons
1302 if ( $tag eq 'iframe' and $hash->{'src'} ) {
1303 foreach my $host (keys %LJ::WHITELIST_VIDEO_HOSTS) {
1304 if ( index ($hash->{'src'}, $host) != -1) {
1306 # Youtube accepts escaped parameters in form "%61utoplay=1"
1307 $hash->{'src'} = LJ::durl($hash->{'src'});
1309 # LJSUP-17010: For all links with media parameter "autoplay" must be deleted or = 0
1310 $hash->{'src'} =~ s/autoplay=1/autoplay=0/gi;
1312 # LJSUP-17018: Replacement autoplay = true on autoplay = false
1313 $hash->{'src'} =~ s/autoplay=true/autoplay=false/gi;
1315 if ( $hash->{'src'} !~ m!player\.seemedia\.pro! && $hash->{'src'} !~ m!wmode=opaque!i ) {
1316 if ( $hash->{'src'} =~ m!\?! ) {
1317 $hash->{'src'} .= '&wmode=opaque';
1318 } else {
1319 $hash->{'src'} .= '?wmode=opaque';
1323 last;
1328 # Through the xsl namespace in XML, it is possible to embed scripting languages
1329 # as elements which will then be executed by the browser. Combining this with
1330 # customview.cgi makes it very easy for someone to replace their entire journal
1331 # in S1 with a page that embeds scripting as well. An example being an AJAX
1332 # six degrees tool, while cool it should not be allowed.
1334 # Example syntax:
1335 # <xsl:element name="script">
1336 # <xsl:attribute name="type">text/javascript</xsl:attribute>
1337 if ($tag eq 'xsl:attribute')
1339 $alt_output = 1; # We'll always deal with output for this token
1341 my $orig_value = $p->get_text; # Get the value of this element
1342 my $value = $orig_value; # Make a copy if this turns out to be alright
1343 $value =~ s/\s+//g; # Remove any whitespace
1345 # See if they are trying to output scripting, if so eat the xsl:attribute
1346 # container and its value
1347 if ($value =~ /(javascript|vbscript)/i) {
1349 # Remove the closing tag from the tree
1350 $p->get_token;
1352 # Remove the value itself from the tree
1353 $p->get_text;
1355 # No harm, no foul...Write back out the original
1356 } else {
1357 $newdata .= "$token->[4]$orig_value";
1361 unless ($alt_output) {
1362 my $allow;
1364 if ($mode eq "allow") {
1365 $allow = 1;
1366 if ($action{$tag} eq "deny") { $allow = 0; }
1367 } else {
1368 $allow = 0;
1369 if ($action{$tag} eq "allow") { $allow = 1; }
1372 my $newtag = '';
1374 if ($allow && ! $remove{$tag}) {
1375 if ($opts->{'tablecheck'}) {
1377 $allow = 0 if
1379 # can't open table elements from outside a table
1380 ($tag =~ /^(?:tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/ && ! @tablescope) ||
1382 # can't open td or th if not inside tr
1383 ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}) ||
1385 # can't open a table unless inside a td or th
1386 ($tag eq 'table' && @tablescope && ! grep { $tablescope[-1]->{$_} } qw(td th));
1389 if ($allow) { $newtag .= "<$tag"; }
1390 else { $newtag .= "&lt;$tag"; }
1392 # output attributes in original order, but only those
1393 # that are allowed (by still being in %$hash after cleaning)
1394 foreach (@$attrs) {
1395 $newtag .= " $_=\"" . LJ::ehtml($hash->{$_}) . "\""
1396 if exists $hash->{$_};
1399 # ignore the effects of slashclose unless we're dealing with a tag that can
1400 # actually close itself. Otherwise, a tag like <em /> can pass through as valid
1401 # even though some browsers just render it as an opening tag
1402 if ($slashclose && $tag =~ $slashclose_tags) {
1403 $newtag .= " /";
1404 $opencount{$tag}--;
1405 $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope;
1407 if ($allow) {
1408 $newtag .= ">";
1409 $opencount{$tag}++;
1411 # maintain current table scope
1412 if ($opts->{'tablecheck'}) {
1414 # open table
1415 if ($tag eq 'table') {
1416 push @tablescope, {};
1418 # new tag within current table
1419 } elsif (@tablescope) {
1420 $tablescope[-1]->{$tag}++;
1425 else { $newtag .= "&gt;"; }
1427 # change iframe with video to placeholder according to user settings
1428 if ( lc $tag eq 'iframe' && $opts->{video_placeholders} ) {
1429 my $width = $hash->{width};
1430 my $height = $hash->{height};
1431 $width =~ s/px$//;
1432 $height =~ s/px$//;
1433 $width = 960 if $width > 960;
1434 $height = 750 if $height > 750;
1436 $width = $width =~ /^\d+$/ ? $width : 320;
1437 $height = $height =~ /^\d+$/ ? $height : 240;
1439 $newdata .= LJ::placeholder_link(
1440 placeholder_html => $newtag,
1441 width => $width,
1442 height => $height,
1443 img => "$LJ::IMGPREFIX/videoplaceholder.png",
1444 remove_video_sizes => $opts->{remove_video_sizes},
1445 no_encode => 1,
1448 else {
1449 $newdata .= $newtag;
1455 # end tag
1456 elsif ($type eq "E")
1458 my $tag = $token->[1];
1459 next TOKEN if $tag =~ /[^\w\-:]/;
1460 if (@eatuntil) {
1461 push @capture, $token if $capturing_during_eat;
1463 if ($eatuntil[-1] eq $tag) {
1464 pop @eatuntil;
1465 if (my $cb = $capturing_during_eat) {
1466 $cb->();
1467 $finish_capture->();
1469 next TOKEN;
1472 next TOKEN if @eatuntil;
1475 # if we're just getting the contents of a cut tag, then pop the
1476 # tag off the stack. if this is the last tag on the stack, then
1477 # go back to eating the rest of the content.
1478 if ( @cuttag_stack ) {
1479 if ( $cuttag_stack[-1] eq $tag ) {
1480 pop @cuttag_stack;
1481 last TOKEN unless ( @cuttag_stack );
1485 if ( $eatall ) {
1486 next TOKEN;
1489 if ($eating_ljuser_span && $tag eq "span") {
1490 $eating_ljuser_span = 0;
1491 $newdata .= $opts->{'textonly'} ? $ljuser_text_node : LJ::ljuser($ljuser_text_node);
1492 next TOKEN;
1495 if ( $opts->{'img_placeholders'} ) {
1496 if ( $tag eq 'a' && $in_link ) {
1497 $in_link = 0;
1498 $text_b_link = 0;
1499 $text_b_link = 0;
1500 $href_b_link = '';
1501 $img_link = 0;
1503 next TOKEN if $text_a_link;
1506 my $allow;
1507 if ($tag eq "lj-raw") {
1508 $opencount{$tag}--;
1509 $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope;
1511 elsif ($tag eq "lj-cut") {
1512 if ($opts->{'cutpreview'}) {
1513 $newdata .= "<b>&lt;/lj-cut&gt;</b>";
1514 } else {
1515 $newdata .= "<a name='cutid$cutcount-end'></a>"
1517 } #'"
1518 elsif ($tag eq "lj-repost" and $ljrepost_allowed and ref $opencount{$tag}) {
1519 ## Add repost button
1520 ## If there is opening <lj-repost> tag than $opencount{$tag} exists.
1522 my $button = LJ::ehtml($opencount{$tag}->{button}) || LJ::Lang::ml("repost.default_button");
1523 my $subject = LJ::ehtml($opencount{$tag}->{subject});
1524 my $captured = substr $newdata => $opencount{$tag}->{offset};
1526 if ($captured and my $entry = LJ::Entry->new_from_url($opts->{cuturl})){
1527 # !!! avoid calling any 'text' methods on $entry,
1528 # it can produce an infinite loop of cleanhtml calls.
1530 unless ($subject){
1531 $subject = LJ::ehtml($entry->subject_raw || LJ::Lang::ml("repost.default_subject"));
1534 if ($subject && Encode::is_utf8($subject)) {
1535 $subject = Encode::encode_utf8($subject);
1538 ## 'posterid' property of a removed (is_valied eq 'false') entry is empty.
1539 my $poster_username = $entry->poster
1540 ? $entry->poster->username
1541 : '';
1543 LJ::EmbedModule->add_user_to_embed($poster_username, \$captured);
1544 $captured = LJ::Lang::ml("repost.wrapper", {
1545 username => $poster_username,
1546 url => $entry->url,
1547 subject => $subject,
1548 text => Encode::encode_utf8($captured),
1551 $captured = Encode::decode_utf8($captured);
1552 $subject = Encode::decode_utf8($subject) if $subject;
1555 $captured = LJ::ehtml($captured);
1557 # add <form> with invisible fields and visible submit button
1558 if ( $captured ) {
1559 $newdata .= qq[
1560 <form action="http://www.$LJ::DOMAIN/update.bml" method="POST">
1561 <div style="display:none;visible:false">
1562 <input type="text" name="subject" value="$subject" />
1563 <textarea name="event">$captured</textarea>
1564 <input type="hidden" name="repost" value="$opts->{cuturl}" />
1565 <input type="hidden" name="repost_type" value="a" />
1566 </div>
1567 <input type="submit" value="$button" />
1568 </form>];
1569 } else {
1570 ## treat <lj-repost></lj-repost> as <lj-repost />
1571 $newdata .= qq[<form action="http://www.$LJ::DOMAIN/update.bml" method="GET">]
1572 . qq[<input type="hidden" name="repost" value="$opts->{cuturl}" />]
1573 . qq[<input type="hidden" name="repost_type" value="a" />]
1574 . qq(<input type="submit" value="$button" /> )
1575 . qq[</form>];
1578 delete $opencount{$tag};
1580 } elsif ( $tag eq 'lj-lang' ) {
1581 # ignore it
1582 } elsif ( $tag eq 'lj-lang-container' ) {
1583 shift @lj_lang_otherwise;
1584 } elsif ( $tag eq 'lj-spoiler' ) {
1585 if ($ljspoiler_allowed && $ljspoilers_open) {
1586 $newdata .= qq{</div></div>};
1587 $ljspoilers_open--;
1589 } elsif ( $tag eq 'lj-quote' ) {
1590 next TOKEN unless $opencount{'lj-quote'};
1592 if ($opencount{'lj-quote-block'}) {
1593 $newdata .= qq{</blockquote>};
1594 $opencount{'lj-quote-block'}--;
1596 $newdata .= qq{</div>};
1597 $opencount{'lj-quote'}--;
1598 } elsif ( $tag eq 'lj-quote-cite' ) {
1599 next TOKEN unless $opencount{'lj-quote-cite'};
1601 $newdata .= qq{</cite>};
1602 $opencount{'lj-quote-cite'}--;
1604 $newdata .= qq{<blockquote class="b-journalblockquote-quote">};
1605 $opencount{'lj-quote-block'}++;
1606 } elsif ( $tag eq 'lj-lead' ) {
1607 next TOKEN unless $opencount{'lj-lead'};
1609 $newdata .= qq{</div>};
1610 $opencount{'lj-lead'}--;
1611 } elsif ( $tag eq 'lj-gallery' ) {
1612 next TOKEN unless $opencount{'lj-gallery'};
1613 undef $opencount{'lj-gallery'};
1614 $newdata .= qq{</lj-gallery>};
1615 } elsif ( $tag eq 'lj-gallery-item' ) {
1616 $newdata .= qq{</lj-gallery-item-capture></lj-gallery-item>};
1617 } elsif ( $tag eq 'lj-image' ) {
1618 $newdata .= qq{</figcaption></figure>};
1619 } else {
1620 if ($mode eq "allow") {
1621 $allow = 1;
1622 if ($action{$tag} eq "deny") { $allow = 0; }
1623 } else {
1624 $allow = 0;
1625 if ($action{$tag} eq "allow") { $allow = 1; }
1628 if ($extractlinks && $tag eq "a") {
1629 if (@canonical_urls) {
1630 my $url = LJ::ehtml(pop @canonical_urls);
1631 $newdata .= "</b> ($url)";
1632 next;
1636 if ($allow && ! $remove{$tag})
1639 if ($opts->{'tablecheck'}) {
1641 $allow = 0 if
1643 # can't close table elements from outside a table
1644 ($tag =~ /^(?:table|tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/ && ! @tablescope) ||
1646 # can't close td or th unless open tr
1647 ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'});
1650 if ($allow && ! ($opts->{'noearlyclose'} && ! $opencount{$tag})) {
1652 # maintain current table scope
1653 if ($opts->{'tablecheck'}) {
1655 # open table
1656 if ($tag eq 'table') {
1657 pop @tablescope;
1659 # closing tag within current table
1660 } elsif (@tablescope) {
1661 $tablescope[-1]->{$tag}--;
1665 $newdata .= "</$tag>";
1666 $opencount{$tag}--;
1667 } else {
1668 $newdata .= "&lt;/$tag&gt;";
1673 elsif ($type eq "D") {
1674 # remove everything past first closing tag
1675 $token->[1] =~ s/>.+/>/s;
1676 # kill any opening tag except the starting one
1677 $token->[1] =~ s/.<//sg;
1678 $newdata .= $token->[1];
1680 elsif ($type eq "T") {
1681 my %url = ();
1682 my %nofollow;
1683 my $urlcount = 0;
1685 if (@eatuntil) {
1686 push @capture, $token if $capturing_during_eat;
1687 next TOKEN;
1690 if ( $eatall ) {
1691 next TOKEN;
1694 if ( $opts->{'img_placeholders'} ) {
1695 if ( $in_link && $img_link ) {
1696 $newdata .= qq~<a href="$href_b_link">~
1697 . $token->[1]
1698 . '</a>';
1699 $text_a_link = 1;
1700 next TOKEN;
1704 if ($eating_ljuser_span) {
1705 $ljuser_text_node = $token->[1];
1706 next TOKEN;
1709 if ($opencount{'style'} && $LJ::DEBUG{'s1_style_textnode'}) {
1710 my $uri = LJ::Request->uri;
1711 my $host = LJ::Request->header_in("Host");
1712 warn "Got text node while style elements open. Shouldn't happen anymore. ($host$uri)\n";
1715 my $auto_format = $addbreaks &&
1716 ($opencount{'table'} <= ($opencount{'td'} + $opencount{'th'})) &&
1717 ! $opencount{'pre'} &&
1718 ! $opencount{'lj-raw'};
1720 if ($auto_format && ! $noautolinks && ! $opencount{'a'} && ! $opencount{'textarea'}) {
1721 my $match = sub {
1722 my $str = shift;
1723 my $end = '';
1724 if ($str =~ /^(.*?)(&(#39|quot|lt|gt)(;.*)?)$/) {
1725 $url{++$urlcount} = $1;
1726 $end = $2;
1727 } else {
1728 $url{++$urlcount} = $str;
1730 $nofollow{$urlcount} = 0;
1731 if ($put_nofollow and $url{$urlcount} =~ m!^https?://([^/]+?)(/.*)?$!) {
1732 my $host = $1;
1733 unless ($host =~ /\Q$LJ::DOMAIN\E$/i) {
1734 $nofollow{$urlcount} = 1;
1737 return "&url$urlcount;$url{$urlcount}&urlend;$end";
1739 ## URL is http://anything-here-but-space-and-quotes/and-last-symbol-isn't-space-comma-period-etc
1740 ## like this (http://example.com) and these: http://foo.bar, http://bar.baz.
1741 $token->[1] =~ s!(https?://[^\s\'\"\<\>]+[^\s\'\"\<\>\.\,\?\:\)])! $match->($1); !ge;
1744 # escape tags in text tokens. shouldn't belong here!
1745 # especially because the parser returns things it's
1746 # confused about (broken, ill-formed HTML) as text.
1747 $token->[1] =~ s/</&lt;/g;
1748 $token->[1] =~ s/>/&gt;/g;
1750 # put <wbr> tags into long words, except inside <pre> and <textarea>.
1751 if ($wordlength && !$opencount{'pre'} && !$opencount{'textarea'}) {
1752 $token->[1] =~ s/(\S{$wordlength,})/break_word($1,$wordlength)/eg;
1755 if ($auto_format && ! $noautolinks && ! $opencount{'a'} && ! $opencount{'textarea'}) {
1756 ## Convert %username%.жж.рф and %username%.живойжурнал.рф to urls
1757 $token->[1] =~ s/(?<!http:\/\/)\b([\w]+\.\x{0436}\x{0436}\.\x{0440}\x{0444})/<a href="http:\/\/$1">$1<\/a>/g;
1758 $token->[1] =~ s/(?<!http:\/\/)\b([\w]+\.\x{0436}\x{0438}\x{0432}\x{043E}\x{0439}\x{0436}\x{0443}\x{0440}\x{043D}\x{0430}\x{043B}\.\x{0440}\x{0444})/<a href="http:\/\/$1">$1<\/a>/g;
1761 # auto-format things, unless we're in a textarea, when it doesn't make sense
1762 if ($auto_format && !$opencount{'textarea'}) {
1763 $token->[1] =~ s/\r?\n/<br \/>/g;
1764 if (! $opencount{'a'}) {
1765 my $tag_a = sub {
1766 my ($key, $title) = @_;
1767 my $nofollow = $nofollow{$key} ? " rel='nofollow'" : "";
1768 return "<a href='$url{$key}'$nofollow>$title</a>";
1770 $token->[1] =~ s|&url(\d+);(.*?)&urlend;|$tag_a->($1,$2)|ge;
1774 $newdata .= $token->[1];
1776 elsif ($type eq "C") {
1778 # probably a malformed tag rather than a comment, so escape it
1779 # -- ehtml things like "<3", "<--->", "<>", etc
1780 # -- comments must start with <! to be eaten
1781 if ($token->[1] =~ /^<[^!]/) {
1782 $newdata .= LJ::ehtml($token->[1]);
1784 # by default, ditch comments
1785 } elsif ($keepcomments) {
1786 my $com = $token->[1];
1787 $com =~ s/^<!--\s*//;
1788 $com =~ s/\s*--!>$//;
1789 $com =~ s/<!--//;
1790 $com =~ s/-->//;
1791 $newdata .= "<!-- $com -->";
1794 elsif ($type eq "PI") {
1795 my $tok = $token->[1];
1796 $tok =~ s/</&lt;/g;
1797 $tok =~ s/>/&gt;/g;
1798 $newdata .= "<?$tok>";
1800 else {
1801 $newdata .= "<!-- OTHER: " . $type . "-->\n";
1803 } # end while
1805 # finish up open links if we're extracting them
1806 if ($extractlinks && @canonical_urls) {
1807 while (my $url = LJ::ehtml(pop @canonical_urls)) {
1808 $newdata .= "</b> ($url)";
1809 $opencount{'a'}--;
1813 # close any tags that were opened and not closed
1814 # don't close tags that don't need a closing tag -- otherwise,
1815 # we output the closing tags in the wrong place (eg, a </td>
1816 # after the <table> was closed) causing unnecessary problems
1817 if (ref $opts->{'autoclose'} eq "ARRAY") {
1818 foreach my $tag (@{$opts->{'autoclose'}}) {
1819 next if $tag =~ /^(?:tr|td|th|tbody|thead|tfoot|li)$/o;
1820 if ($opencount{$tag}) {
1821 $newdata .= "</$tag>" x $opencount{$tag};
1826 if ($ljspoilers_open) {
1827 $newdata .= qq{</div></div>} x $ljspoilers_open;
1830 if ($opencount{'lj-quote-cite'}) {
1831 $newdata .= qq{</cite>} x $opencount{'lj-quote-cite'};
1834 if ($opencount{'lj-quote-block'}) {
1835 $newdata .= qq{</blockquote>} x $opencount{'lj-quote-block'};
1838 if ($opencount{'lj-quote'}) {
1839 $newdata .= qq{</div>} x $opencount{'lj-quote'};
1842 if ($opencount{'lj-lead'}) {
1843 $newdata .= qq{</div>} x $opencount{'lj-lead'};
1846 # extra-paranoid check
1847 1 while $newdata =~ s/<script\b//ig;
1849 $newdata =~ s/<x-vk-like id="(\d+)">/$vkontakte_like_js{$1}/eg;
1851 $$data = $newdata;
1852 $$data .= $extra_text if $extra_text; # invalid markup error
1854 # encode data back to utf8 before return
1855 $$data = Encode::encode_utf8($$data);
1857 if ($suspend_msg) {
1858 my $msg = qq{<div style="color: #000; font: 12px Verdana, Arial, Sans-Serif; background-color: #ffeeee; background-repeat: repeat-x; border: 1px solid #ff9999; padding: 8px; margin: 5px auto; width: auto; text-align: left; background-image: url('$LJ::IMGPREFIX/message-error.gif?v=4888');">};
1859 my $link_style = "color: #00c; text-decoration: underline; background: transparent; border: 0;";
1861 if ($unsuspend_supportid) {
1862 $msg .= LJ::Lang::ml('cleanhtml.suspend_msg_with_supportid', { aopts => "href='$LJ::SITEROOT/support/see_request.bml?id=$unsuspend_supportid' style='$link_style'" });
1863 } else {
1864 $msg .= LJ::Lang::ml('cleanhtml.suspend_msg', { aopts => "href='$LJ::SITEROOT/abuse/report.bml' style='$link_style'" });
1867 $msg .= "</div>";
1869 $$data = $msg . $$data;
1872 return 0;
# Rewrites relative URLs inside an HTML fragment as absolute URLs.
#
# Arguments:
#   $data - scalar reference to the HTML; overwritten in place, but only
#           when at least one URL was actually rewritten
#   $base - base URL that relative URLs are resolved against
#
# Only <a href="..."> and <img src="..."> are examined, and a value is
# treated as relative when it does not begin with a lowercase "scheme:".
# Tokens that need no change are copied through from the parser's original
# text; a changed start tag is rebuilt from its attributes (original order,
# values passed through LJ::ehtml).
#
# Returns undef.
sub resolve_relative_urls {
    my ($data, $base) = @_;
    my $parser = HTML::TokeParser->new($data);

    # tag name => attributes of that tag which may hold a relative URL
    my %url_attrs_of = (
        a   => [ 'href' ],
        img => [ 'src' ],
    );

    # non-start token type => index of the element that is copied to the
    # output; token types not listed here contribute nothing
    my %text_slot_of = (
        E  => 2,
        D  => 1,
        T  => 1,
        C  => 1,
        PI => 2,
    );

    my $changed_any = 0;
    my $base_uri;        # built lazily, on the first relative URL found
    my $out = "";

  TOKEN:
    while (my $token = $parser->get_token) {
        my $type = $token->[0];

        # everything except a start tag is passed through untouched
        if ($type ne "S") {
            my $slot = $text_slot_of{$type};
            $out .= $token->[$slot] if defined $slot;
            next TOKEN;
        }

        # start tag: name, attribute hashref, attribute names in original order
        my ($tag, $hash, $attrs) = @{$token}[1 .. 3];

        my $changed_tag = 0;
        for my $name (@{ $url_attrs_of{$tag} || [] }) {
            my $value = $hash->{$name};
            next if !defined $value || $value =~ /^[a-z]+:/;

            $base_uri ||= URI->new($base);
            $hash->{$name} = URI->new_abs($value, $base_uri)->as_string;
            $changed_any = $changed_tag = 1;
        }

        # nothing rewritten: emit the tag exactly as it appeared in the input
        if (!$changed_tag) {
            $out .= $token->[4];
            next TOKEN;
        }

        # for tags like <name/>, treat it as <name> and put the slash back
        # at the end (XML-style empty tag marker)
        my $slash_on_name = ($tag =~ s!/$!!);
        my $slash_as_attr = delete $hash->{'/'};

        # rebuild the opening tag, attributes in their original order
        $out .= "<$tag";
        for my $name (@$attrs) {
            next unless exists $hash->{$name};
            $out .= " $name=\"" . LJ::ehtml($hash->{$name}) . "\"";
        }
        $out .= " /" if $slash_on_name || $slash_as_attr;
        $out .= ">";
    }

    $$data = $out if $changed_any;
    return undef;
}
# Expands the path portion of an "lj:" pseudo-URL (for example "faq/12" or
# "user/somebody/profile") into a URL under $LJ::SITEROOT.
#
# The first path segment that survives filtering selects the mode; the
# remaining segments are handed to that mode's handler.  A missing or
# unrecognised mode yields "$LJ::SITEROOT/error:bogus-lj-url".
#
# Note: empty segments, and any segment that is false in Perl (such as "0"),
# are discarded before dispatch.
sub ExpandLJURL {
    my ($mode, @rest) = grep { $_ } split(/\//, $_[0]);

    # Builds a handler for pages addressed by a numeric id: the first
    # segment is numified, and a zero/non-numeric id selects the index page.
    my $by_id = sub {
        my ($with_id_fmt, $index_page) = @_;
        return sub {
            my $id = shift() + 0;
            return $id ? sprintf($with_id_fmt, $id) : $index_page;
        };
    };

    # Builds a handler for pages that take an optional "?user=" argument:
    # the first segment goes through LJ::canonical_username and the
    # parameter is appended only when that returns a true value.
    my $by_user = sub {
        my ($page) = @_;
        return sub {
            my $user = LJ::canonical_username(shift);
            return $user ? $page . "?user=$user" : $page;
        };
    };

    my %handler_for = (
        faq      => $by_id->("support/faq/%s.html", "support/faq/"),
        support  => $by_id->("support/see_request.bml?id=%s", "support/"),
        memories => $by_user->("memories.bml"),
        pubkey   => $by_user->("pubkey.bml"),
        todo     => $by_user->("todo/"),
        userinfo => $by_user->("userinfo.bml"),
        userpics => $by_user->("allpics.bml"),

        # journal URL; extra segments become sub-directories, except that
        # "profile" as the first extra segment selects the userinfo page
        user     => sub {
            my ($name, @parts) = @_;
            my $user = LJ::canonical_username($name);

            # refuse anything that carries quote/markup characters
            return "" if grep { /[\"\'\<\>\n\&]/ } @parts;

            return "userinfo.bml?user=$user" if $parts[0] eq 'profile';
            return "users/$user/" . join("", map { "$_/" } @parts);
        },
    );

    my $handler = $handler_for{$mode};
    my $uri     = $handler ? $handler->(@rest) : "error:bogus-lj-url";

    return "$LJ::SITEROOT/$uri";
}
# Tag lists handed to clean() for subject lines.  $subject_allow is used
# both as the 'allow' list and as the 'autoclose' list.
my $subject_eat    = [ qw(head title style layer iframe applet object param base) ];
my $subject_allow  = [ qw(a b i u em strong cite) ];
my $subject_remove = [ qw(bgsound embed object caption link font noscript lj-userpic) ];

# Cleans the markup of a subject line in place.
#
# Arguments:
#   $ref  - scalar reference to the subject text
#   $opts - optional hashref; its entries are merged after the defaults
#           below, so they override them, before everything is passed
#           on to clean()
#
# A subject containing neither '<' nor '>' is left untouched and clean()
# is not called at all.  Otherwise the result of clean() is returned.
sub clean_subject {
    my ($ref, $opts) = @_;
    return unless $$ref =~ /[\<\>]/;
    $opts ||= {};

    my %defaults = (
        'wordlength'     => 40,
        'addbreaks'      => 0,
        'eat'            => $subject_eat,
        'mode'           => 'deny',
        'allow'          => $subject_allow,
        'remove'         => $subject_remove,
        'autoclose'      => $subject_allow,
        'noearlyclose'   => 1,
        'remove_attribs' => [ qw(id class style) ],
    );

    return clean($ref, { %defaults, %$opts });
}
## Produces a pure-text subject (needed in links, email headers, etc...).
##
## Takes a scalar reference to the subject and cleans it in place by
## calling clean() in 'deny' mode with the 'textonly' option set.  A subject
## containing neither '<' nor '>' is left untouched and clean() is not
## called.  Otherwise the result of clean() is returned.
my $subjectall_eat = [ qw(head title style layer iframe applet object lj-spoiler) ];
sub clean_subject_all {
    my ($ref) = @_;
    return unless $$ref =~ /[\<\>]/;

    my %settings = (
        'wordlength'   => 40,
        'addbreaks'    => 0,
        'eat'          => $subjectall_eat,
        'mode'         => 'deny',
        'textonly'     => 1,
        'autoclose'    => $subject_allow,
        'noearlyclose' => 1,
    );

    return clean($ref, \%settings);
}
# Wrapper around clean_subject_all that also shortens the subject.
#
# Arguments:
#   $ref    - scalar reference to the subject; modified in place
#   $length - length passed to LJ::text_trim; 40 is used when it is
#             omitted or false
#
# After cleaning, everything from the first newline onwards is dropped and
# the remainder is passed through LJ::text_trim.
sub clean_and_trim_subject {
    my ($ref, $length) = @_;
    $length ||= 40;

    LJ::CleanHTML::clean_subject_all($ref);

    # keep only the first line
    $$ref =~ s/\n.*//s;

    $$ref = LJ::text_trim($$ref, 0, $length);
}
# Tag lists for entry ("event") text.
# NOTE(review): presumably passed to clean() as its 'eat' and 'remove'
# options by clean_event -- confirm against the callers.
my $event_eat    = [ qw(head title style layer applet object xml param base) ];
my $event_remove = [ qw(bgsound embed object link body meta noscript plaintext noframes) ];

# Tags that take a closing tag, grouped by kind; the order of the
# elements is unchanged.
my @comment_close = (
    qw(a sub sup xmp bdo q span),                                   # inline
    qw(b i u tt s strike big small font),                           # font styling
    qw(abbr acronym cite code dfn em kbd samp strong var del ins),  # phrase elements
    qw(h1 h2 h3 h4 h5 h6 div blockquote address pre center),        # block level
    qw(ul ol li dl dt dd),                                          # lists
    qw(table tr td th tbody tfoot thead colgroup caption),          # tables
    qw(area map form textarea blink),                               # everything else
);

# The list above plus a further set of tags.
my @comment_all = (
    @comment_close,
    qw(img br hr p col iframe audio video source),
);

# User bios reuse the entry lists (same array references) and take a copy
# of the comment close list.
my $userbio_eat    = $event_eat;
my $userbio_remove = $event_remove;
my @userbio_close  = @comment_close;
# Cleans the text of a journal entry in place.
#   $ref  - scalar reference to the entry text
#   $opts - hashref of options. For backward compatibility any other value
#           is taken to be the old 'preformatted' flag.
# Options read here: wordlength (defaults to 40 when undefined),
# preformatted, suspend_msg, journalid, posterid and cutpreview. All entries
# of $opts are also forwarded to clean(), overriding the defaults below.
sub clean_event {
    my ($ref, $opts) = @_;

    # legacy call style: the second argument used to be the preformatted flag
    $opts = { 'preformatted' => $opts } if ref($opts) ne "HASH";

    my $wordlength = $opts->{'wordlength'};
    $wordlength = 40 unless defined $wordlength;

    # The full cleaner is needed when the text contains markup, something to
    # linkify (http or one of the two .рф domains), is preformatted, or a
    # suspend message was requested.
    my $needs_cleaner =
           $$ref =~ /\<|\>|http/
        || $$ref =~ /(.*?)\.?жж\.рф/
        || $$ref =~ /(.*?)\.?живойжурнал\.рф/
        || $opts->{preformatted}
        || $opts->{suspend_msg};

    # fast path: break over-long words and convert newlines, nothing else
    unless ($needs_cleaner) {
        if ($wordlength) {
            $$ref =~ s/(\S{$wordlength,})/break_word($1,$wordlength)/eg;
        }
        $$ref =~ s/\r?\n/<br \/>/g;
        return;
    }

    # CSS is cleaned unless the journal is listed in %LJ::STYLE_TRUSTED;
    # without a journalid no CSS cleaning is requested here.
    my $journalid = $opts->{'journalid'};
    my $cleancss  = $journalid ? ! $LJ::STYLE_TRUSTED{$journalid} : 0;

    my $poster         = LJ::load_userid( $opts->{'posterid'} );
    my $journal        = LJ::load_userid( $journalid );
    my $active_journal = LJ::get_active_journal();

    # Strong CSS cleaning follows $cleancss, except that it is switched off
    # for a poster with the 'no_strong_clean_css' cap who is both the journal
    # owner and the active journal.
    my $poster_is_exempt = $poster
        && $poster->get_cap('no_strong_clean_css')
        && $poster->equals($journal)
        && $poster->equals($active_journal);
    my $strongcleancss = $poster_is_exempt ? 0 : $cleancss;

    # slow path: run the text through the cleaner
    my %settings = (
        'linkify'          => 1,
        'wordlength'       => $wordlength,
        'addbreaks'        => $opts->{'preformatted'} ? 0 : 1,
        'cutpreview'       => $opts->{'cutpreview'},
        'posterid'         => $opts->{'posterid'},
        'eat'              => $event_eat,
        'mode'             => 'allow',
        'remove'           => $event_remove,
        'autoclose'        => \@comment_close,
        'cleancss'         => $cleancss,
        'strongcleancss'   => $strongcleancss,
        'noearlyclose'     => 1,
        'tablecheck'       => 1,
        'ljrepost_allowed' => 1,
        %$opts,
    );

    return clean($ref, \%settings);
}
# Pre-filters entry HTML before it is put into the entry form.
#   $ref - scalar reference to UTF-8 encoded text, rewritten in place
#
# The text is tokenized with HTML::TokeParser. Start tags are filtered and
# re-serialized; every other token is copied through unchanged.
#   * <script> is dropped together with everything up to its end tag
#   * <meta>, tags whose name ends in ":set" and tags whose name does not
#     look like a plain tag name are dropped
#   * attributes whose name starts with "on" or "dynsrc" are dropped
#   * href/src values that start with "data" (any letter case, optional
#     leading whitespace) are dropped
#   * the whole tag is dropped when an attribute name starts with '=' or
#     contains \x0b or \x0d
#   * attributes with an unusual name, or whose value contains a script-like
#     scheme (jscript, livescript, javascript, vbscript, about), are dropped
# Text without any '<' is returned untouched.
sub pre_clean_event_for_entryform {
    my $ref = shift;

    ## fast path - no html tags
    return unless $$ref =~ /</;

    ## slow path
    my $data = Encode::decode_utf8($$ref);
    my $p = HTML::TokeParser->new(\$data);
    my $newdata = '';

    TOKEN:
    while (my $token = $p->get_token) {
        my $type = $token->[0];
        if ($type eq 'S') {
            ## start tag
            my $tag   = $token->[1];
            my $hash  = $token->[2]; # attributes
            my $attrs = $token->[3]; # attribute names, in original order

            ## check the tag
            if ($tag eq 'script') {
                # skip ahead to the matching end tag, discarding the content
                $p->get_tag("/$tag");
                next TOKEN;
            }
            if ($tag eq 'meta') {
                next TOKEN;
            }
            if ($tag =~ /:set$/) {
                next TOKEN;
            }
            unless ($tag =~ /^\w([\w\-:_]*\w)?\/?$/) {
                next TOKEN;
            }

            ## check attributes
            # a '/' pseudo-attribute marks a self-closing tag
            my $autoclose = delete $hash->{'/'};
            foreach my $attr (keys %$hash) {
                if ($attr =~ /^(?:on|dynsrc)/) {
                    delete $hash->{$attr};
                    next;
                } elsif ($attr eq 'href' || $attr eq 'src') {
                    # URI schemes are case-insensitive and leading whitespace
                    # is ignored by browsers, so "DATA:" and " data:" must be
                    # caught as well as "data:".
                    if ($hash->{$attr} =~ /^\s*data/i) {
                        delete $hash->{$attr};
                        next;
                    }
                }
                if ($attr =~ /(?:^=)|[\x0b\x0d]/) {
                    next TOKEN;
                }
                unless ($attr =~ /^[\w_:-]+$/) {
                    delete $hash->{$attr};
                    next;
                }
                # look for script schemes with tabs, newlines and NULs removed
                my $tmp = $hash->{$attr};
                $tmp =~ s/[\t\n\0]//g;
                if ($tmp =~ /(?:jscript|livescript|javascript|vbscript|about):/ix) {
                    delete $hash->{$attr};
                    next;
                }
                ## TODO: css & xslt js expressions
            }

            ## reconstruct the tag from the attributes that survived
            $newdata .= "<$tag";
            foreach (@$attrs) {
                $newdata .= " $_=\"" . LJ::ehtml($hash->{$_}) . "\"" if exists $hash->{$_};
            }
            $newdata .= ($autoclose) ? " />" : ">";
        } elsif ($type eq 'E' or $type eq 'PI') {
            ## close (end) tags and processing instructions
            $newdata .= $token->[2];
        } else {
            $newdata .= $token->[1];
        }
    }

    # extra-paranoid check
    1 while $newdata =~ s/<script\b//ig;

    $$ref = Encode::encode_utf8($newdata);
}
# Returns the contents of @comment_all, the tag list that clean_comment and
# clean_message pass to clean() as 'allow'.
sub get_okay_comment_tags {
    return @comment_all;
}
# Cleans the text of a comment in place.
#   ref:  scalarref of text to clean
#   opts: either a hashref of options, including
#           - preformatted: if true, don't insert breaks
#           - anon_comment: passed on as 'extractlinks' and 'extractimages'
#         or a plain boolean scalar, which is taken as the preformatted flag
#         (in that case 'nocss' is switched on as well)
# Returns 0 when the fast path was taken, otherwise whatever clean() returns.
sub clean_comment {
    my ($ref, $opts) = @_;

    # legacy call style: a bare flag instead of an options hashref
    $opts = { 'preformatted' => $opts, 'nocss' => 1 } unless ref $opts;

    my $needs_cleaner =
           $$ref =~ /\<|\>|http/
        || $$ref =~ /(.*?)\.?жж\.рф/
        || $$ref =~ /(.*?)\.?живойжурнал\.рф/
        || $opts->{preformatted};

    # fast path: no markup or URLs to linkify
    unless ($needs_cleaner) {
        $$ref =~ s/(\S{40,})/break_word($1,40)/eg;
        $$ref =~ s/\r?\n/<br \/>/g;
        return 0;
    }

    # slow path: run the text through the cleaner
    my %settings = (
        'linkify'            => 1,
        'wordlength'         => 40,
        'addbreaks'          => $opts->{preformatted} ? 0 : 1,
        'eat'                => [ qw(head title style layer applet object) ],
        'mode'               => 'deny',
        'allow'              => \@comment_all,
        'autoclose'          => \@comment_close,
        'cleancss'           => 1,
        'strongcleancss'     => $opts->{'blocked_content'} ? 0 : 1,
        'extractlinks'       => $opts->{'anon_comment'},
        'extractimages'      => $opts->{'anon_comment'},
        'noearlyclose'       => 1,
        'tablecheck'         => 1,
        'nocss'              => $opts->{'nocss'},
        'textonly'           => $opts->{'textonly'} ? 1 : 0,
        'remove_positioning' => 1,
        'posterid'           => $opts->{'posterid'},
        'img_placeholders'   => $opts->{'img_placeholders'},
        'video_placeholders' => $opts->{'video_placeholders'},
        'remove_img_sizes'   => $opts->{'remove_img_sizes'},
        'remove_video_sizes' => $opts->{'remove_video_sizes'},
        'no_encode'          => $opts->{'no_encode'},
    );

    return clean($ref, \%settings);
}
# Cleans the text of a message in place; there is no fast path, the text
# always goes through clean().
#   ref:  scalarref of text to clean
#   opts: optional hashref; only 'nocss' and 'textonly' are read
# Returns whatever clean() returns.
sub clean_message {
    my ($ref, $opts) = @_;

    my $nocss    = $opts->{'nocss'};
    my $textonly = $opts->{'textonly'} ? 1 : 0;

    my %settings = (
        'linkify'            => 1,
        'wordlength'         => 40,
        'addbreaks'          => 0,
        'eat'                => [ qw(head title style layer applet object) ],
        'mode'               => 'deny',
        'allow'              => \@comment_all,
        'autoclose'          => \@comment_close,
        'cleancss'           => 1,
        'strongcleancss'     => 1,
        'noearlyclose'       => 1,
        'tablecheck'         => 1,
        'nocss'              => $nocss,
        'textonly'           => $textonly,
        'remove_positioning' => 1,
    );

    return clean($ref, \%settings);
}
# Cleans a user bio in place.
#   $ref  - scalar reference to the bio text; anything that is not a
#           reference makes the sub return undef without doing any work
#   %opts - optional key/value pairs that override the defaults below
# Returns whatever clean() returns.
sub clean_userbio {
    my $ref       = shift;
    my %overrides = @_;

    return undef unless ref $ref;

    my %final_opts = (
        'wordlength'   => 100,
        'addbreaks'    => 1,
        'attrstrip'    => [ qw(style) ],
        'mode'         => 'allow',
        'noearlyclose' => 1,
        'tablecheck'   => 1,
        'eat'          => $userbio_eat,
        'remove'       => $userbio_remove,
        'autoclose'    => \@userbio_close,
        'cleancss'     => 1,
        %overrides,
    );

    return clean($ref, \%final_opts);
}
# Cleans every variable of an S1 style and returns the result serialized.
#   $s1 - S1 style source; LJ::parse_vars fills a hash of variables from it
# Each variable value is cleaned in place with clean(), which is also told
# the variable's name through the 's1var' option.
# Returns the Storable::nfreeze() image of the cleaned variable hash.
sub clean_s1_style {
    my $s1 = shift;

    my %tmpl;
    LJ::parse_vars(\$s1, \%tmpl);

    foreach my $v (keys %tmpl) {
        clean(\$tmpl{$v}, {
            'eat'          => [qw[layer script object embed applet]],
            'mode'         => 'allow',
            'keepcomments' => 1, # allows CSS to work
            'clean_js_css' => 1,
            's1var'        => $v,
        });
    }

    return Storable::nfreeze(\%tmpl);
}
# Escapes a value for use inside an HTML attribute of an S1 style.
#   $_[0] - the attribute value (the caller's variable is not modified)
# Tabs and newlines are removed, then the characters " ' < > are replaced by
# entities. '&' is left as it is.
# Returns the escaped value, or "" when the value contains a script-like
# scheme (vbscript, javascript, jscript, livescript or about) followed by a
# colon, with optional whitespace between the letters.
sub s1_attribute_clean {
    # not named $a: that would hide the package variable sort() uses
    my ($value) = @_;

    $value =~ s/[\t\n]//g;
    $value =~ s/\"/&quot;/g;
    $value =~ s/\'/&\#39;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;

    # IE sucks: it tolerates whitespace inside the scheme name. The scheme
    # list matches the one checked by pre_clean_event_for_entryform.
    if ($value =~ /(?: (?: v\s*b | j\s*a\s*v\s*a | j | l\s*i\s*v\s*e )
                       \s* s\s*c\s*r\s*i\s*p\s*t
                     | a\s*b\s*o\s*u\s*t
                   ) \s* :/ix) {
        return "";
    }

    return $value;
}
# Normalizes a user-supplied URL.
#   $url       - the URL text
#   $allow_all - when true, the URL is only trimmed and any scheme is kept
# Without $allow_all the result always starts with one of http, https, ftp
# or webcal followed by "://". Any other leading scheme is removed and
# replaced by http.
# Returns '' for an empty (or false) URL and for a URL that consists of a
# scheme only.
sub canonical_url {
    my ($url, $allow_all) = @_;

    # strip leading and trailing spaces
    $url =~ s/^\s*//;
    $url =~ s/\s*$//;

    return '' unless $url;

    unless ($allow_all) {
        # see what protocol they want, default to http; schemes are
        # case-insensitive, so "HTTPS:" must not be downgraded to http
        my $pref = "http";
        $pref = lc $1 if $url =~ /^(https?|ftp|webcal):/i;

        # Strip out the protocol section. Only a colon that comes before any
        # '/', '?' or '#' can end a scheme; a colon further inside the path
        # or query of a scheme-less URL must not swallow the host.
        # NOTE: a scheme-less "host:port/..." still loses its host here.
        $url =~ s!^[^:/?#\n]*:/*!!;

        return '' unless $url;

        # rebuild safe url
        $url = "$pref://$url";
    }

    if ($LJ::DEBUG{'aol_http_to_ftp'}) {
        # aol blocks http referred from lj, but ftp has no referer header.
        if ($url =~ m!^http://(?:www\.)?(?:members|hometown|users)\.aol\.com/!) {
            $url =~ s!^http!ftp!;
        }
    }

    return $url;
}
# Inserts "<wbr />" into a long word so that it can wrap.
#   $word - the text to break
#   $at   - number of $onechar matches per chunk; a false value returns the
#           word unchanged
# A break is added after each run of $at matches of the file-level $onechar
# pattern, but only where \B holds (no word boundary at that position).
sub break_word {
    my ($word, $at) = @_;

    if ($at) {
        $word =~ s!((?:$onechar){$at})\B!$1<wbr />!g;
    }

    return $word;
}
2413 sub clean_friends {
2414 my $ref = shift;
2416 my @tags_remove = qw(bgsound embed object link body meta noscript plaintext noframes);
2417 my @tags_allow = qw(lj);
2419 LJ::CleanHTML::clean($ref, {
2420 'linkify' => 1,
2421 'wordlength' => 160,
2422 'undefined_tags' => 'eat',
2423 'allow' => \@tags_allow,
2424 'remove' => \@tags_remove,
2425 'cleancss' => 1,
2426 'noearlyclose' => 1,
2427 'tablecheck' => 1,
2428 'textonly' => 1,
2431 # Trim function must be a part of cleanHTML::clean method,
2432 # but now this method is too complicated to do this right way.
2433 # Now just cut off last breaked tag.
2435 # trim text
2436 my $trunc = LJ::text_trim($$ref, 640, 320);
2437 if ($$ref ne $trunc) {
2438 $trunc =~ s/(\W+\w+)$//; # cut off last space and chars right from it.
2440 # cut off last unclosed tag
2441 if ($trunc =~ m!\</?([^>]+)$!) { # ... <tag or ... </tag
2442 my $tag = $1;
2443 $trunc =~ s!</?\Q$tag\E>?.*?$!!;
2446 # add '...' to the tail
2447 $$ref = $trunc . ' ...';