cgi-bin/ljtextutil.pl

   1 package LJ;
   2 use strict;
   3 no warnings 'uninitialized';
   4
   5 use Class::Autouse qw(
   6     LJ::ConvUTF8
   7     HTML::TokeParser
   8     HTML::Parser
   9     LJ::Text
  10 );
  11
  12 use Encode;
  13
  14 # <LJFUNC>
  15 # name: LJ::trim
  16 # class: text
  17 # des: Removes whitespace from left and right side of a string.
  18 # args: string
  19 # des-string: string to be trimmed
  20 # returns: trimmed string
  21 # </LJFUNC>
  22 sub trim
  23 {
  24     my $a = $_[0];
  25     $a =~ s/^\s+//;
  26     $a =~ s/\s+$//;
  27     return $a;
  28 }
  29
  30 # <LJFUNC>
  31 # name: LJ::decode_url_string
  32 # class: web
  33 # des: Parse URL-style arg/value pairs into a hash.
  34 # args: buffer, hashref
  35 # des-buffer: Scalar or scalarref of buffer to parse.
  36 # des-hashref: Hashref to populate.
  37 # returns: boolean; true.
  38 # </LJFUNC>
  39 sub decode_url_string
  40 {
  41     my $a = shift;
  42     my $buffer = ref $a ? $a : \$a;
  43     my $hashref = shift;  # output hash
  44     my $keyref  = shift;  # array of keys as they were found
  45
  46     my $pair;
  47     my @pairs = split(/&/, $$buffer);
  48     @$keyref = @pairs;
  49     my ($name, $value);
  50     foreach $pair (@pairs)
  51     {
  52         ($name, $value) = split(/=/, $pair);
  53         $value =~ tr/+/ /;
  54         $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
  55         $name =~ tr/+/ /;
  56         $name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
  57         $hashref->{$name} .= $hashref->{$name} ? "\0$value" : $value;
  58     }
  59     return 1;
  60 }
  61
  62 # args: hashref of key/values
  63 #       arrayref of keys in order (optional)
  64 # returns: urlencoded string
  65 sub encode_url_string {
  66     my ($hashref, $keyref) = @_;
  67
  68     return join('&', map { LJ::eurl($_) . '=' . LJ::eurl($hashref->{$_}) }
  69                 (ref $keyref ? @$keyref : keys %$hashref));
  70 }
  71
  72 # <LJFUNC>
  73 # name: LJ::eurl
  74 # class: text
  75 # des: Escapes a value before it can be put in a URL.  See also [func[LJ::durl]].
  76 # args: string
  77 # des-string: string to be escaped
  78 # returns: string escaped
  79 # </LJFUNC>
  80 sub eurl
  81 {
  82     return LJ::Text->eurl(@_);
  83 }
  84
  85 # <LJFUNC>
  86 # name: LJ::durl
  87 # class: text
  88 # des: Decodes a value that's URL-escaped.  See also [func[LJ::eurl]].
  89 # args: string
  90 # des-string: string to be decoded
  91 # returns: string decoded
  92 # </LJFUNC>
  93 sub durl
  94 {
  95     return LJ::Text->durl(@_);
  96 }
  97
  98 # <LJFUNC>
  99 # name: LJ::exml
 100 # class: text
 101 # des: Escapes a value before it can be put in XML.
 102 # args: string
 103 # des-string: string to be escaped
 104 # returns: string escaped.
 105 # </LJFUNC>
 106 sub exml
 107 {
 108     # fast path for the commmon case:
 109     return $_[0] unless $_[0] =~ /[&\"\'<>\x00-\x08\x0B\x0C\x0E-\x1F]/;
 110     # what are those character ranges? XML 1.0 allows:
 111     # #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 112
 113     my $a = shift;
 114     $a =~ s/\&/&amp;/g;
 115     $a =~ s/\"/&quot;/g;
 116     $a =~ s/\'/&apos;/g;
 117     $a =~ s/</&lt;/g;
 118     $a =~ s/>/&gt;/g;
 119     $a =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;
 120     return $a;
 121 }
 122
 123 # <LJFUNC>
 124 # name: LJ::ehtml
 125 # class: text
 126 # des: Escapes a value before it can be put in HTML.
 127 # args: string
 128 # des-string: string to be escaped
 129 # returns: string escaped.
 130 # </LJFUNC>
 131 sub ehtml
 132 {
 133     # fast path for the commmon case:
 134     return $_[0] unless $_[0] =~ /[&\"\'<>]/;
 135
 136     # this is faster than doing one substitution with a map:
 137     my $a = $_[0];
 138     $a =~ s/\&/&amp;/g;
 139     $a =~ s/\"/&quot;/g;
 140     $a =~ s/\'/&\#39;/g;
 141     $a =~ s/</&lt;/g;
 142     $a =~ s/>/&gt;/g;
 143     return $a;
 144 }
 145
 146 # <LJFUNC>
 147 # name: LJ::ehtm
 148 # class: text
 149 # des: Escapes a value before it can be put in HTML. Modified ehtml. Don't encode & if it parth of mnemonic.
 150 # args: string
 151 # des-string: string to be escaped
 152 # returns: string escaped.
 153 # </LJFUNC>
 154 sub ehtm
 155 {
 156     # fast path for the commmon case:
 157     return $_[0] unless $_[0] =~ /[&\"\'<>]/;
 158
 159     # this is faster than doing one substitution with a map:
 160     my $a = $_[0];
 161     $a =~ s/\&(?!(#?\w{2,7};))/&amp;/g;
 162     $a =~ s/\"/&quot;/g;
 163     $a =~ s/\'/&\#39;/g;
 164     $a =~ s/</&lt;/g;
 165     $a =~ s/>/&gt;/g;
 166     return $a;
 167 }
 168 *eall = \&ehtml;  # old BML syntax required eall to also escape BML.  not anymore.
 169
 170 # <LJFUNC>
 171 # name: LJ::dhtml
 172 # class: text
 173 # des: Remove HTML-escaping
 174 # args: string
 175 # des-string: string to be un-escaped
 176 # returns: string with HTML
 177 # </LJFUNC>
 178 sub dhtml
 179 {
 180     my $a = $_[0];
 181     $a =~ s/&quot;/"/g;
 182     $a =~ s/&\#39;/'/g;
 183     $a =~ s/&apos;/'/g;
 184     $a =~ s/&lt;/</g;
 185     $a =~ s/&gt;/>/g;
 186     $a =~ s/&amp;/&/g;
 187     return $a;
 188 }
 189
 190 # <LJFUNC>
 191 # name: LJ::etags
 192 # class: text
 193 # des: Escapes < and > from a string
 194 # args: string
 195 # des-string: string to be escaped
 196 # returns: string escaped.
 197 # </LJFUNC>
 198 sub etags
 199 {
 200     # fast path for the commmon case:
 201     return $_[0] unless $_[0] =~ /[<>]/;
 202
 203     my $a = $_[0];
 204     $a =~ s/</&lt;/g;
 205     $a =~ s/>/&gt;/g;
 206     return $a;
 207 }
 208
 209 # <LJFUNC>
 210 # name: LJ::ejs
 211 # class: text
 212 # des: Escapes a string value before it can be put in JavaScript.
 213 # args: string
 214 # des-string: string to be escaped
 215 # returns: string escaped.
 216 # </LJFUNC>
 217 sub ejs
 218 {
 219     my $a = $_[0];
 220     $a =~ s/[\"\'\\]/\\$&/g;
 221     $a =~ s/&quot;/\\&quot;/g;
 222     $a =~ s/\r?\n/\\n/gs;
 223     $a =~ s/\r//gs;
 224     return $a;
 225 }
 226
 227 # <LJFUNC>
 228 # name: LJ::ecvs
 229 # class: text
 230 # des: Escapes a string value before it can be put in .csv file.
 231 # args: string
 232 # des-string: string to be escaped
 233 # returns: string escaped.
 234 # </LJFUNC>
 235 sub ecsv
 236 {
 237     my $a = $_[0];
 238     $a =~ s/"/""/g;
 239     return '"'.$a.'"';
 240 }
 241
 242 # given a string, makes it into a string you can put into javascript,
 243 # including protecting against closing </script> tags in the entry.
 244 # does the double quotes for ya.
 245 sub ejs_string {
 246     my $str = ejs($_[0]);
 247     $str =~ s!</script!</scri\" + \"pt!gi;
 248     return "\"" . $str . "\"";
 249 }
 250
 251 # changes every char in a string to %XX where XX is the hex value
 252 # this is useful for passing strings to javascript through HTML, because
 253 # javascript's "unescape" function expects strings in this format
 254 sub ejs_all
 255 {
 256     my $a = $_[0];
 257     $a =~ s/(.)/uc sprintf("%%%02x",ord($1))/eg;
 258     return $a;
 259 }
 260
 261
 262 # strip all HTML tags from a string
 263 sub strip_html {
 264     my $str = shift;
 265     my $opts = shift || {};
 266 =head
 267     $str =~ s/\<lj user\=['"]?([\w-]+)['"]?\>/$1/g;   # "
 268     if ($opts->{use_space}) {
 269         $str =~ s/\<([^\<])+\>/ /g;
 270     } else {
 271         $str =~ s/\<([^\<])+\>//g;
 272     }
 273     return $str;
 274 =cut
 275
 276     my $p = HTML::Parser->new(api_version => 3,
 277                 handlers => {
 278                     text  => [sub { $_[0]->{res} .= $_[1] }, 'self, text'], # concat plain text
 279                     # handle tags
 280                     start => [sub {
 281                                     my ($self, $tag, $attrs, $origtext) = @_;
 282                                     if ($tag =~ /lj/i){
 283                                         if ($opts->{'expand_lj_user_tag'}) {
 284                                             $self->{res} .= $origtext;
 285                                         } else {
 286                                             $self->{res} .= $attrs->{user} || $attrs->{comm};  # <lj user="username" title=".."> -> username
 287                                         }
 288                                     } else {
 289                                         $self->{res} .= ' ' if $opts->{use_space}; # for other tags add spaces if needed.
 290                                     }
 291                                     if ($opts->{noparse_tags}) {
 292                                         for (@{$opts->{noparse_tags}}) {
 293                                             $self->{res} .= $origtext if $tag eq $_;
 294                                         }
 295                                     }
 296                                    },
 297                                   'self, tagname, attr, text'
 298                                ],
 299                     end => [sub {
 300                                  my ($self, $tag, $origtext) = @_;
 301                                  if ($opts->{noparse_tags}) {
 302                                      for (@{$opts->{noparse_tags}}) {
 303                                         $self->{res} .= $origtext if $tag eq $_;
 304                                      }
 305                                  }
 306                             }, 'self, tagname, text'
 307                            ]
 308                  },
 309             );
 310     $p->parse($str);
 311     $p->eof;
 312
 313     return $p->{res};
 314 }
 315
 316 # <LJFUNC>
 317 # name: LJ::is_ascii
 318 # des: checks if text is pure ASCII.
 319 # args: text
 320 # des-text: text to check for being pure 7-bit ASCII text.
 321 # returns: 1 if text is indeed pure 7-bit, 0 otherwise.
 322 # </LJFUNC>
 323 sub is_ascii {
 324     my $text = shift;
 325     return ($text !~ m/[^\x01-\x7f]/);
 326 }
 327
 328 # <LJFUNC>
 329 # name: LJ::is_utf8
 330 # des: check text for UTF-8 validity.
 331 # args: text
 332 # des-text: text to check for UTF-8 validity
 333 # returns: 1 if text is a valid UTF-8 stream, 0 otherwise.
 334 # </LJFUNC>
 335 sub is_utf8 {
 336     my $text = shift;
 337
 338     if (LJ::are_hooks("is_utf8")) {
 339         return LJ::run_hook("is_utf8", $text);
 340     }
 341
 342     return 1 if Encode::is_utf8($text);
 343
 344     my $eval_res = eval { Encode::decode_utf8($text); 1 };
 345     return $eval_res ? 1 : 0;
 346 }
 347
 348 # <LJFUNC>
 349 # name: LJ::text_out
 350 # des: force outgoing text into valid UTF-8.
 351 # args: text
 352 # des-text: reference to text to pass to output. Text if modified in-place.
 353 # returns: nothing.
 354 # </LJFUNC>
 355 sub text_out
 356 {
 357     my $rtext = shift;
 358
 359     # if we're not Unicode, do nothing
 360     return unless $LJ::UNICODE;
 361
 362     # is this valid UTF-8 already?
 363     return if LJ::is_utf8($$rtext);
 364
 365     # no. Blot out all non-ASCII chars
 366     $$rtext =~ s/[\x00\x80-\xff]/\?/g;
 367     return;
 368 }
 369
 370 # <LJFUNC>
 371 # name: LJ::text_in
 372 # des: do appropriate checks on input text. Should be called on all
 373 #      user-generated text.
 374 # args: text
 375 # des-text: text to check
 376 # returns: 1 if the text is valid, 0 if not.
 377 # </LJFUNC>
 378 sub text_in
 379 {
 380     my $text = shift;
 381     return 1 unless $LJ::UNICODE;
 382     if (ref ($text) eq "HASH") {
 383         return ! (grep { !LJ::is_utf8($_) } values %{$text});
 384     }
 385     if (ref ($text) eq "ARRAY") {
 386         return ! (grep { !LJ::is_utf8($_) } @{$text});
 387     }
 388     return LJ::is_utf8($text);
 389 }
 390
 391 # <LJFUNC>
 392 # name: LJ::text_convert
 393 # des: convert old entries/comments to UTF-8 using user's default encoding.
 394 # args: dbs?, text, u, error
 395 # des-dbs: optional. Deprecated; a master/slave set of database handles.
 396 # des-text: old possibly non-ASCII text to convert
 397 # des-u: user hashref of the journal's owner
 398 # des-error: ref to a scalar variable which is set to 1 on error
 399 #            (when user has no default encoding defined, but
 400 #            text needs to be translated).
 401 # returns: converted text or undef on error
 402 # </LJFUNC>
 403 sub text_convert
 404 {
 405     &nodb;
 406     my ($text, $u, $error) = @_;
 407
 408     # maybe it's pure ASCII?
 409     return $text if LJ::is_ascii($text);
 410
 411     # load encoding id->name mapping if it's not loaded yet
 412     LJ::load_codes({ "encoding" => \%LJ::CACHE_ENCODINGS } )
 413         unless %LJ::CACHE_ENCODINGS;
 414
 415     if ($u->{'oldenc'} == 0 ||
 416         not defined $LJ::CACHE_ENCODINGS{$u->{'oldenc'}}) {
 417         $$error = 1;
 418         return undef;
 419     };
 420
 421     # convert!
 422     my $name = $LJ::CACHE_ENCODINGS{$u->{'oldenc'}};
 423     unless (LJ::ConvUTF8->supported_charset($name)) {
 424         $$error = 1;
 425         return undef;
 426     }
 427
 428     return LJ::ConvUTF8->to_utf8($name, $text);
 429 }
 430
 431
 432 # <LJFUNC>
 433 # name: LJ::text_length
 434 # des: returns both byte length and character length of a string. In a non-Unicode
 435 #      environment, this means byte length twice. In a Unicode environment,
 436 #      the function assumes that its argument is a valid UTF-8 string.
 437 # args: text
 438 # des-text: the string to measure
 439 # returns: a list of two values, (byte_length, char_length).
 440 # </LJFUNC>
 441
 442 sub text_length
 443 {
 444     my $text = shift;
 445     my $bl = length($text);
 446     unless ($LJ::UNICODE) {
 447         return ($bl, $bl);
 448     }
 449     my $cl = 0;
 450     my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
 451
 452     while ($text =~ m/$utf_char/go) { $cl++; }
 453     return ($bl, $cl);
 454 }
 455
 456 # <LJFUNC>
 457 # name: LJ::text_trim
 458 # des: truncate string according to requirements on byte length, char
 459 #      length, or both. "char length" means number of UTF-8 characters if
 460 #      [ljconfig[unicode]] is set, or the same thing as byte length otherwise.
 461 # args: text, byte_max, char_max
 462 # des-text: the string to trim
 463 # des-byte_max: maximum allowed length in bytes; if 0, there's no restriction
 464 # des-char_max: maximum allowed length in chars; if 0, there's no restriction
 465 # returns: the truncated string.
 466 # </LJFUNC>
 467 sub text_trim
 468 {
 469     my ($text, $byte_max, $char_max) = @_;
 470     return $text unless $byte_max or $char_max;
 471     if (!$LJ::UNICODE) {
 472         $byte_max = $char_max if $char_max and $char_max < $byte_max;
 473         $byte_max = $char_max unless $byte_max;
 474         return substr($text, 0, $byte_max);
 475     }
 476     my $cur = 0;
 477     my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
 478
 479     # if we don't have a character limit, assume it's the same as the byte limit.
 480     # we will never have more characters than bytes, but we might have more bytes
 481     # than characters, so we can't inherit the other way.
 482     $char_max ||= $byte_max;
 483     $byte_max ||= ($char_max + 0) * 4;
 484
 485     while ($text =~ m/$utf_char/gco) {
 486     last unless $char_max;
 487         last if $cur + length($1) > $byte_max and $byte_max;
 488         $cur += length($1);
 489         $char_max--;
 490     }
 491     return substr($text,0,$cur);
 492 }
 493
 494 # trim string, but not truncate in middle of the word
 495 # Deprecated. Use LJ::Text->truncate_to_word_with_ellipsis
 496 sub trim_at_word
 497 {
 498     my ($text, $char_max) = @_;
 499
 500     return $text if length($text) <= $char_max;
 501
 502     $char_max -= 3; # space for '...'
 503
 504     my $short_text = text_trim($text, undef, $char_max);
 505     my $short_len  = length($short_text);
 506     my $full_len   = length($text);
 507
 508     if ($short_len < $full_len) { # really trimmed
 509         # need correct last word and add '...'
 510         my $last_char = substr($short_text, -1, 1);
 511         my $first_char = substr($text, $short_len, 1);
 512         if ($last_char ne ' ' and $first_char ne ' ') {
 513             my $space_idx = rindex($short_text, ' ');
 514             my $dot_idx   = rindex($short_text, '.');
 515             my $comma_idx = rindex($short_text, ',');
 516             my $semi_idx  = rindex($short_text, ';');
 517             my $colon_idx = rindex($short_text, ':');
 518
 519             my $max = (sort {$b <=> $a} $space_idx, $dot_idx, $comma_idx, $semi_idx, $colon_idx)[0];
 520             $short_text = substr($text, 0, $max);
 521
 522             # attention: &#2116; must not lose ';' sign
 523             if ($max == $semi_idx) {
 524                 my $one_char_longer = substr($text, 0, $max + 1);
 525                 if ($one_char_longer =~ /&.+;$/) { # entity in any form
 526                     $short_text = $one_char_longer; # we must keep in whole
 527                 }
 528             }
 529
 530             # seconde attempt to reduce text to the end of phrase
 531             return $short_text . '...' if $short_text =~ s/([.;:!?])[^\\1]{1,5}$//;
 532         }
 533     }
 534
 535     return $short_text . '...';
 536 }
 537
 538 # <LJFUNC>
 539 # name: LJ::html_trim_4gadgets
 540 # des: truncate string according to requirements on char length.
 541 # args: text, char_max
 542 # des-text: the string to trim
 543 # des-char_max: maximum allowed length in chars; if 0, there's no restriction
 544 # returns: the truncated string.
 545 # </LJFUNC>
 546
 547 # internal subs
 548
 549 sub _html_trim_4gadgets_count_chars
 550 {
 551     my ($l, $text_out_len, $max_len) = @_;
 552
 553     if ($$text_out_len + $l > $max_len) {
 554         return 0;
 555     } else {
 556         $$text_out_len += $l;
 557         return $l;
 558     }
 559 }
 560
 561 my %_html_trim_4gadgets_autoclose = map { $_ => $_ } qw(p li ol ul a strike);
 562
 563 # Subroutine itself
 564 sub html_trim_4gadgets
 565 {
 566     my $text = shift;
 567     my $max_len = shift;
 568     my $link = shift;
 569
 570     my $text_out = '';
 571     my $text_out_len = 0;
 572     my $finish = 0;
 573
 574     my @tags2autoclose = ();
 575     my $text_before_table = '';
 576
 577     # collapse all white spaces to one space.
 578     $text =~ s/\s{2,}/ /g;
 579
 580     # remove <lj-cut> ... </lj-cut>
 581     $text =~ s/\(\&nbsp;<a href="http:\/\/.+?\.html\?#cutid.+?<\/a>\&nbsp;\)//g;
 582
 583     my $clean_tag = sub {
 584         my ($type, $tag, $attr, $protected) = @_;
 585         my $ret = '';
 586
 587         $ret = "<$tag />" if 'S' eq $type && $attr->{'/'};
 588
 589         if ('S' eq $type) {
 590             my $added_attrs = '';
 591             if ($protected) {
 592                 foreach my $k (keys %$attr) {
 593                     delete $attr->{$k} unless $protected->{lc $k};
 594                 }
 595                 $added_attrs = join(' ', map { $attr->{$_} . "=\"$_\"" } keys %$attr);
 596             }
 597             $ret = "<$tag$added_attrs>";
 598             push @tags2autoclose, $tag if exists $_html_trim_4gadgets_autoclose{lc $tag};
 599         } else {
 600             pop @tags2autoclose;
 601             $ret = "</$tag>";
 602         }
 603
 604         return $ret;
 605     };
 606
 607     my $clean_table_tag = sub { $clean_tag->(@_, { map { $_ => $_ } qw(rowspan colspan) } ) };
 608
 609     my $clean_hn_tag = sub {
 610         my ($type, $tag, $attr) = @_;
 611         my $ret = '';
 612
 613         $ret = "<p /><strong />" if 'S' eq $type && $attr->{'/'};
 614
 615         if ('S' eq $type) {
 616             $ret = "<p><strong>";
 617         } else {
 618             $ret = "</p></strong>";
 619         }
 620
 621         return $ret;
 622     };
 623
 624     my %reconstruct = (
 625         img => sub {
 626             my ($type, $tag, $attr) = @_;
 627
 628             if ('S' eq $type && $attr->{'src'}) {
 629                 # <img ...> tag count as 50 chars
 630                 if (_html_trim_4gadgets_count_chars(50, \$text_out_len, $max_len)) {
 631                     return "<img src=\"".$attr->{'src'}."\" border=\"0\" />";
 632                 } else {
 633                     $finish = 1;
 634                 }
 635             }
 636             return '';
 637         },
 638
 639         a => sub {
 640             my ($type, $tag, $attr) = @_;
 641
 642             if ('S' eq $type && $attr->{'href'}) {
 643                 push @tags2autoclose, $tag;
 644                 return "<$tag href=\"".$attr->{'href'}."\" target=\"_blank\">";
 645             }
 646             if ('E' eq $type) {
 647                 pop @tags2autoclose;
 648                 return "</$tag>";
 649             }
 650             return '';
 651         },
 652
 653         p       => $clean_tag,
 654         br      => $clean_tag,
 655         wbr     => $clean_tag,
 656         li      => $clean_tag,
 657         ol      => $clean_tag,
 658         ul      => $clean_tag,
 659         s       => $clean_tag,
 660         strike  => $clean_tag,
 661
 662         table   => sub {
 663             my ($type, $tag, $attr) = @_;
 664
 665             if ('S' eq $type && $attr->{'href'}) {
 666                 $text_before_table = $text_out unless $text_before_table;
 667                 push @tags2autoclose, $tag;
 668                 return "<$tag cellpadding=\"5\" cellspacing=\"5\" border=\"0\">"
 669             }
 670             if ('E' eq $type) {
 671                 pop @tags2autoclose;
 672                 $text_before_table = '' unless grep { /table/i } @tags2autoclose;
 673                 return "</$tag>";
 674             }
 675             return '';
 676         },
 677
 678         th      => $clean_table_tag,
 679         tr      => $clean_table_tag,
 680         td      => $clean_table_tag,
 681
 682         h1      => $clean_hn_tag,
 683         h2      => $clean_hn_tag,
 684         h3      => $clean_hn_tag,
 685         h4      => $clean_hn_tag,
 686         h5      => $clean_hn_tag,
 687         h6      => $clean_hn_tag,
 688
 689         'lj-embed'  => sub {
 690             my ($type, $tag, $attr) = @_;
 691
 692             if ('S' eq $type && $attr->{'id'}) {
 693                 return "<$tag id=\"".$attr->{'id'}."\" />";
 694             }
 695
 696             return '';
 697         },
 698
 699     );
 700
 701     my $p = HTML::TokeParser->new(\$text);
 702
 703     while (my $token = $p->get_token) {
 704         my ($type, $tag, $attr) = @$token;
 705
 706         if ('T' eq $type) {
 707             if(_html_trim_4gadgets_count_chars(length($tag),\$text_out_len,$max_len)) {
 708                 $text_out .= $tag;
 709             } else {
 710                 # Try to cut $tag and add some words, not whole text.
 711                 $text_out .= LJ::trim_at_word($tag, $max_len - $text_out_len);
 712                 $text_out =~ s/\.\.\.$//; # remove last '...' added by LJ::trim_at_word()
 713                 $finish = 1;
 714             }
 715             next;
 716         }
 717
 718         if (exists($reconstruct{lc $tag})) {
 719             $text_out .= $reconstruct{$tag}->($type, $tag, $attr);
 720         }
 721
 722         # <lj-poll-n>
 723         if ($tag =~ m/^lj-poll-(\d+)$/g && 'S' eq $type) {
 724            my $pollid = $1;
 725            my $name = LJ::Poll->new($pollid)->name;
 726            if ($name) {
 727                LJ::Poll->clean_poll(\$name);
 728            } else {
 729                $name = "#$pollid";
 730            }
 731
 732            $text_out .= "<a href=\"$LJ::SITEROOT/poll/?id=$pollid\" target=\"_blank\" >View Poll: $name.</a>";
 733         }
 734
 735         last if $finish;
 736     }
 737
 738     # Close all open tags.
 739     if ($finish && @tags2autoclose) {
 740         while ($_ = pop @tags2autoclose) {
 741             if ('table' eq lc $_) {
 742                 $text_out = $text_before_table;
 743                 last;
 744             }
 745             $text_out .= "</$_>";
 746         }
 747     }
 748
 749     return $text_out . ($finish && $link ? "<a href=\"$link\">...</a>" : '');
 750 }
 751
 752 # <LJFUNC>
 753 # name: LJ::text_compress
 754 # des: Compresses a chunk of text, to gzip, if configured for site.  Can compress
 755 #      a scalarref in place, or return a compressed copy.  Won't compress if
 756 #      value is too small, already compressed, or size would grow by compressing.
 757 # args: text
 758 # des-text: either a scalar or scalarref
 759 # returns: nothing if given a scalarref (to compress in-place), or original/compressed value,
 760 #          depending on site config.
 761 # </LJFUNC>
 762 sub text_compress
 763 {
 764     my $text = shift;
 765     my $ref = ref $text;
 766     return $ref ? undef : $text unless $LJ::COMPRESS_TEXT;
 767     die "Invalid reference" if $ref && $ref ne "SCALAR";
 768
 769     my $tref = $ref ? $text : \$text;
 770     my $pre_len = length($$tref);
 771     unless (substr($$tref,0,2) eq "\037\213" || $pre_len < 100) {
 772         my $gz = Compress::Zlib::memGzip($$tref);
 773         if (length($gz) < $pre_len) {
 774             $$tref = $gz;
 775         }
 776     }
 777
 778     return $ref ? undef : $$tref;
 779 }
 780
 781 # <LJFUNC>
 782 # name: LJ::text_uncompress
 783 # des: Uncompresses a chunk of text, from gzip, if configured for site.  Can uncompress
 784 #      a scalarref in place, or return a compressed copy.  Won't uncompress unless
 785 #      it finds the gzip magic number at the beginning of the text.
 786 # args: text
 787 # des-text: either a scalar or scalarref.
 788 # returns: nothing if given a scalarref (to uncompress in-place), or original/uncompressed value,
 789 #          depending on if test was compressed or not
 790 # </LJFUNC>
 791 sub text_uncompress
 792 {
 793     my $text = shift;
 794     my $ref = ref $text;
 795     die "Invalid reference" if $ref && $ref ne "SCALAR";
 796     my $tref = $ref ? $text : \$text;
 797
 798     # check for gzip's magic number
 799     if (substr($$tref,0,2) eq "\037\213") {
 800         $$tref = Compress::Zlib::memGunzip($$tref);
 801     }
 802
 803     return $ref ? undef : $$tref;
 804 }
 805
 806 # trimm text for small widgets(myspace, yandexbar, facebook and etc.)
 807 # args:
 808 # text =>
 809 # length =>
 810 # img_length =>
 811 # return: trimmed text
 812 sub trim_widgets {
 813      my %args = @_;
 814      my $max_length = $args{length};
 815      my $img_length = $args{img_length} || 50;
 816      my @allow_tags = qw(img p li ol ul a br s strike table th tr td h1 h2 h3 h4 h5 h6 lj);
 817      my @close_tags = qw(li ol ul a s strike h1 h2 h3 h4 h5 h6);
 818
 819      my $event = '';
 820      my $event_length = 0;
 821      my $table_tags = 0;
 822      my $buff = '';
 823      my $buff_length = 0;
 824      my @tags_stack;
 825      my @parts = split /(<[^>]*>)/, $args{text};
 826      while (defined(my $slice = shift @parts)){
 827          if( my ($close_tag, $tag, $attrib) = ($slice =~ m#<(/?)\s*(\w+)(\s+[^>]*)?>#) ){
 828              next unless grep {$tag eq $_} @allow_tags;
 829              if( $close_tag ){
 830                  my $j;
 831                  for $j( 0 .. @tags_stack ){
 832                      last if $tags_stack[$j] eq $tag;
 833                  }
 834                  splice(@tags_stack, $j, 1);
 835                  $table_tags-- if ($tag eq 'table') && $table_tags;
 836              } else {
 837                  if( $tag eq 'img' ){
 838                     if ($buff_length + $event_length > $max_length - $img_length){
 839                         push @parts, $slice;
 840                         last;
 841                     };
 842                     $attrib =~ s#.*(src=['"][^'"]*['"])$#$1 /#;
 843                     $buff_length += $img_length;
 844                  } elsif( $tag eq 'a' ) {
 845                     $attrib =~ s#.*(href=['"][^'"]*['"]).*$#$1#;
 846                     $attrib = 'target="_blank" ' . $attrib;
 847                  } elsif( $tag eq 'table' ){
 848                     $attrib = 'cellpadding="5" cellspacing="5" border="0"';
 849                     $table_tags++;
 850                  } elsif( $tag =~ /t[hdr]/ ){
 851                     $attrib = join(' ', grep {$_ =~ /^(col|row)span/}
 852                         split /\s+/, $attrib);
 853                  }
 854
 855                  unshift @tags_stack, $tag
 856                     if grep {$tag eq $_} @close_tags;
 857
 858                  $slice = "<$tag" . ($attrib?" $attrib>":'>');
 859              }
 860              $slice = $close_tag?'</strong></p>':'<p><strong>' if $tag =~ /h\d/;
 861          } else {
 862             my $slice_length = LJ::text_length($slice);
 863             if ($event_length + $buff_length > $max_length - $slice_length) {
 864                 if ($table_tags){
 865                     push @parts, $slice;
 866                     last;
 867                 }
 868                 my @words = split /([\n\s]+)/, $slice;
 869                 for my $w (@words){
 870                     my $word_length = LJ::text_length($w);
 871                     if ($event_length + $word_length > $max_length){
 872                         push @parts, $slice;
 873                         last;
 874                     }
 875                     $event_length += $word_length;
 876                     $event .= $w;
 877                 }
 878                 last;
 879             }
 880             $buff_length += $slice_length;
 881          }
 882
 883          $buff .= $slice;
 884          unless( $table_tags ){
 885              $event_length += $buff_length;
 886              $event .= $buff;
 887              $buff_length = 0;
 888              $buff = '';
 889          }
 890      }
 891
 892      $event = $event . "</$_>" for @tags_stack;
 893      $event = $event . $args{'read_more'} if @parts;
 894      return $event;
 895 }
 896
 897 # event => text
 898 # embed_url => url
 899 sub convert_lj_tags_to_links {
 900     my %args = @_;
 901     while ($args{event} =~ /<lj-poll-(\d+)>/g) {
 902        my $pollid = $1;
 903        my $name = LJ::Poll->new($pollid)->name;
 904        if ($name) {
 905            LJ::Poll->clean_poll(\$name);
 906        } else {
 907            $name = "#$pollid";
 908        }
 909        my $polltext = LJ::Lang::ml('entry.ljpoll', {name=>$name});
 910        $args{event} =~ s|<lj-poll-$pollid>|<a href="$LJ::SITEROOT/poll/?id=$pollid" target="_blank" >$polltext</a>|g;
 911     }
 912
 913     my $embedtext = LJ::Lang::ml('entry.ljembed');
 914     $args{event} =~ s|<lj\-embed[^>]+/>|<a href="$args{embed_url}">$embedtext</a>|g;
 915     while ( $args{event} =~ /<lj\s+user="([^>"]+)"\s*\/?>/g ){
 916         # follow the documentation - no about communites, openid or syndicated, just user
 917         my $username = $1;
 918         if (my $user = LJ::load_user($username)) {
 919             # $username may be not equal to $name (if $username is not canonical username)
 920             my $name = $user->username;
 921             my $html = '<a href="' . $user->profile_url . '" target="_blank"><img src="'
 922                 . $LJ::IMGPREFIX . '/userinfo.gif?v=17080" alt=""></a><a href="'
 923                 . $user->journal_base . '" target="_blank">' . $name . '</a>';
 924             $args{event} =~ s|<lj\s+user="$username"\s*\/?>|$html|g;
 925         } else {
 926             $args{event} =~ s|<lj\s+user="$username"\s*\/?>|$username|g;
 927         }
 928     }
 929     $args{event} =~ s|</?lj-cut[^>]*>||g;
 930
 931     return $args{event};
 932 }
 933
 934 # function to trim a string containing HTML.  this will auto-close any
 935 # html tags that were still open when the string was truncated
 936 sub html_trim {
 937     my ($text, $char_max) = @_;
 938
 939     return $text unless $char_max;
 940
 941     my $p = HTML::TokeParser->new(\$text);
 942     my @open_tags; # keep track of what tags are open
 943     my $out = '';
 944     my $content_len = 0;
 945
 946   TOKEN:
 947     while (my $token = $p->get_token) {
 948         my $type = $token->[0];
 949         my $tag  = $token->[1];
 950         my $attr = $token->[2];  # hashref
 951
 952         if ($type eq "S") {
 953             my $selfclose;
 954
 955             # start tag
 956             $out .= "<$tag";
 957
 958             # assume tags are properly self-closed
 959             $selfclose = 1 if lc $tag eq 'input' || lc $tag eq 'br' || lc $tag eq 'img';
 960
 961             # preserve order of attributes. the original order is
 962             # in element 4 of $token
 963             foreach my $attrname (@{$token->[3]}) {
 964                 if ($attrname eq '/') {
 965                     $selfclose = 1;
 966                     next;
 967                 }
 968
 969                 # FIXME: ultra ghetto.
 970                 $attr->{$attrname} = LJ::no_utf8_flag($attr->{$attrname});
 971                 $out .= " $attrname=\"" . LJ::ehtml($attr->{$attrname}) . "\"";
 972             }
 973
 974             $out .= $selfclose ? " />" : ">";
 975
 976             push @open_tags, $tag unless $selfclose;
 977
 978         } elsif ($type eq 'T' || $type eq 'D') {
 979             my $content = $token->[1];
 980
 981             if (length($content) + $content_len > $char_max) {
 982
 983                 # truncate and stop parsing
 984                 $content = LJ::text_trim($content, undef, ($char_max - $content_len));
 985                 $out .= $content;
 986                 last;
 987             }
 988
 989             $content_len += length $content;
 990
 991             $out .= $content;
 992
 993         } elsif ($type eq 'C') {
 994             # comment, don't care
 995             $out .= $token->[1];
 996
 997         } elsif ($type eq 'E') {
 998             # end tag
 999             pop @open_tags;
1000             $out .= "</$tag>";
1001         }
1002     }
1003
1004     $out .= join("\n", map { "</$_>" } reverse @open_tags);
1005
1006     return $out;
1007 }
1008
1009 # takes a number, inserts commas where needed
1010 sub commafy {
1011     my $number = shift;
1012     return $number unless $number =~ /^\d+$/;
1013
1014     my $punc = LJ::Lang::ml('number.punctuation') || ",";
1015     $number =~ s/(?<=\d)(?=(\d\d\d)+(?!\d))/$punc/g;
1016     return $number;
1017 }
1018
1019 # <LJFUNC>
1020 # name: LJ::html_newlines
1021 # des: Replace newlines with HTML break tags.
1022 # args: text
1023 # returns: text, possibly including HTML break tags.
1024 # </LJFUNC>
1025 sub html_newlines
1026 {
1027     my $text = shift;
1028     $text =~ s/\n/<br \/>/gm;
1029
1030     return $text;
1031 }
1032
1033 # given HTML, returns an arrayref of URLs to images that are in the HTML
1034 sub html_get_img_urls {
1035     my $htmlref = shift;
1036     my %opts = @_;
1037
1038     my $exclude_site_imgs = $opts{exclude_site_imgs} || 0;
1039
1040     my @image_urls;
1041     my $p = HTML::TokeParser->new($htmlref);
1042     return []
1043         unless $p;
1044     while (my $token = $p->get_token) {
1045         if ($token->[1] eq 'img' and ref $token->[2] eq 'HASH') {
1046             my $attrs = $token->[2];
1047
1048             if ( exists $attrs->{'src'} && $attrs->{'src'} ) {
1049                 push @image_urls, $attrs->{'src'} if
1050                     ($exclude_site_imgs ? $attrs->{'src'} !~ /^$LJ::IMGPREFIX/ : 1);
1051             }
1052         }
1053     }
1054
1055     return \@image_urls;
1056 }
1057
1058 # given HTML, returns an arrayref of link URLs that are in the HTML
1059 sub html_get_link_urls {
1060     my $htmlref = shift;
1061     my %opts = @_;
1062
1063     my @link_urls;
1064     my $p = HTML::TokeParser->new($htmlref);
1065
1066     while (my $token = $p->get_token) {
1067         if ($token->[0] eq "S" && $token->[1] eq "a" and ref $token->[2] eq 'HASH') {
1068             my $attrs = $token->[2];
1069             foreach my $attr (keys %$attrs) {
1070                 push @link_urls, $attrs->{$attr} if $attr eq "href";
1071             }
1072         }
1073     }
1074
1075     return \@link_urls;
1076 }
1077
1078 sub delimited_number {
1079     my ($number, $delimiter) = @_;
1080     $delimiter ||= ' ';
1081
1082     $number =~ s/(\d{1,3})(?=(\d{3})+$)/$1$delimiter/g;
1083
1084     return $number;
1085 }
1086
1087 1;
1088