LJSUP-17669: Login.bml form refactoring
[livejournal.git] / cgi-bin / ljtextutil.pl
bloba825f18a99d318a94badedc2cf3981f36a2984d1
1 package LJ;
2 use strict;
3 no warnings 'uninitialized';
5 use Class::Autouse qw(
6 LJ::ConvUTF8
7 HTML::TokeParser
8 HTML::Parser
9 LJ::Text
12 use Encode;
14 # <LJFUNC>
15 # name: LJ::trim
16 # class: text
17 # des: Removes whitespace from left and right side of a string.
18 # args: string
19 # des-string: string to be trimmed
20 # returns: trimmed string
21 # </LJFUNC>
22 sub trim
24 my $a = $_[0];
25 $a =~ s/^\s+//;
26 $a =~ s/\s+$//;
27 return $a;
30 # <LJFUNC>
31 # name: LJ::decode_url_string
32 # class: web
33 # des: Parse URL-style arg/value pairs into a hash.
34 # args: buffer, hashref
35 # des-buffer: Scalar or scalarref of buffer to parse.
36 # des-hashref: Hashref to populate.
37 # returns: boolean; true.
38 # </LJFUNC>
39 sub decode_url_string
41 my $a = shift;
42 my $buffer = ref $a ? $a : \$a;
43 my $hashref = shift; # output hash
44 my $keyref = shift; # array of keys as they were found
46 my $pair;
47 my @pairs = split(/&/, $$buffer);
48 @$keyref = @pairs;
49 my ($name, $value);
50 foreach $pair (@pairs)
52 ($name, $value) = split(/=/, $pair);
53 $value =~ tr/+/ /;
54 $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
55 $name =~ tr/+/ /;
56 $name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
57 $hashref->{$name} .= $hashref->{$name} ? "\0$value" : $value;
59 return 1;
62 # args: hashref of key/values
63 # arrayref of keys in order (optional)
64 # returns: urlencoded string
65 sub encode_url_string {
66 my ($hashref, $keyref) = @_;
68 return join('&', map { LJ::eurl($_) . '=' . LJ::eurl($hashref->{$_}) }
69 (ref $keyref ? @$keyref : keys %$hashref));
72 # <LJFUNC>
73 # name: LJ::eurl
74 # class: text
75 # des: Escapes a value before it can be put in a URL. See also [func[LJ::durl]].
76 # args: string
77 # des-string: string to be escaped
78 # returns: string escaped
79 # </LJFUNC>
80 sub eurl
82 return LJ::Text->eurl(@_);
85 # <LJFUNC>
86 # name: LJ::durl
87 # class: text
88 # des: Decodes a value that's URL-escaped. See also [func[LJ::eurl]].
89 # args: string
90 # des-string: string to be decoded
91 # returns: string decoded
92 # </LJFUNC>
93 sub durl
95 return LJ::Text->durl(@_);
98 # <LJFUNC>
99 # name: LJ::exml
100 # class: text
101 # des: Escapes a value before it can be put in XML.
102 # args: string
103 # des-string: string to be escaped
104 # returns: string escaped.
105 # </LJFUNC>
106 sub exml
108 # fast path for the commmon case:
109 return $_[0] unless $_[0] =~ /[&\"\'<>\x00-\x08\x0B\x0C\x0E-\x1F]/;
110 # what are those character ranges? XML 1.0 allows:
111 # #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
113 my $a = shift;
114 $a =~ s/\&/&amp;/g;
115 $a =~ s/\"/&quot;/g;
116 $a =~ s/\'/&apos;/g;
117 $a =~ s/</&lt;/g;
118 $a =~ s/>/&gt;/g;
119 $a =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;
120 return $a;
123 # <LJFUNC>
124 # name: LJ::ehtml
125 # class: text
126 # des: Escapes a value before it can be put in HTML.
127 # args: string
128 # des-string: string to be escaped
129 # returns: string escaped.
130 # </LJFUNC>
131 sub ehtml
133 # fast path for the commmon case:
134 return $_[0] unless $_[0] =~ /[&\"\'<>]/;
136 # this is faster than doing one substitution with a map:
137 my $a = $_[0];
138 $a =~ s/\&/&amp;/g;
139 $a =~ s/\"/&quot;/g;
140 $a =~ s/\'/&\#39;/g;
141 $a =~ s/</&lt;/g;
142 $a =~ s/>/&gt;/g;
143 return $a;
146 # <LJFUNC>
147 # name: LJ::ehtm
148 # class: text
149 # des: Escapes a value before it can be put in HTML. Modified ehtml. Don't encode & if it parth of mnemonic.
150 # args: string
151 # des-string: string to be escaped
152 # returns: string escaped.
153 # </LJFUNC>
154 sub ehtm
156 # fast path for the commmon case:
157 return $_[0] unless $_[0] =~ /[&\"\'<>]/;
159 # this is faster than doing one substitution with a map:
160 my $a = $_[0];
161 $a =~ s/\&(?!(#?\w{2,7};))/&amp;/g;
162 $a =~ s/\"/&quot;/g;
163 $a =~ s/\'/&\#39;/g;
164 $a =~ s/</&lt;/g;
165 $a =~ s/>/&gt;/g;
166 return $a;
168 *eall = \&ehtml; # old BML syntax required eall to also escape BML. not anymore.
170 # <LJFUNC>
171 # name: LJ::dhtml
172 # class: text
173 # des: Remove HTML-escaping
174 # args: string
175 # des-string: string to be un-escaped
176 # returns: string with HTML
177 # </LJFUNC>
178 sub dhtml
180 my $a = $_[0];
181 $a =~ s/&quot;/"/g;
182 $a =~ s/&\#39;/'/g;
183 $a =~ s/&apos;/'/g;
184 $a =~ s/&lt;/</g;
185 $a =~ s/&gt;/>/g;
186 $a =~ s/&amp;/&/g;
187 return $a;
190 # <LJFUNC>
191 # name: LJ::etags
192 # class: text
193 # des: Escapes < and > from a string
194 # args: string
195 # des-string: string to be escaped
196 # returns: string escaped.
197 # </LJFUNC>
198 sub etags
200 # fast path for the commmon case:
201 return $_[0] unless $_[0] =~ /[<>]/;
203 my $a = $_[0];
204 $a =~ s/</&lt;/g;
205 $a =~ s/>/&gt;/g;
206 return $a;
209 # <LJFUNC>
210 # name: LJ::ejs
211 # class: text
212 # des: Escapes a string value before it can be put in JavaScript.
213 # args: string
214 # des-string: string to be escaped
215 # returns: string escaped.
216 # </LJFUNC>
217 sub ejs
219 my $a = $_[0];
220 $a =~ s/[\"\'\\]/\\$&/g;
221 $a =~ s/&quot;/\\&quot;/g;
222 $a =~ s/\r?\n/\\n/gs;
223 $a =~ s/\r//gs;
224 return $a;
227 # <LJFUNC>
228 # name: LJ::ecvs
229 # class: text
230 # des: Escapes a string value before it can be put in .csv file.
231 # args: string
232 # des-string: string to be escaped
233 # returns: string escaped.
234 # </LJFUNC>
235 sub ecsv
237 my $a = $_[0];
238 $a =~ s/"/""/g;
239 return '"'.$a.'"';
242 # given a string, makes it into a string you can put into javascript,
243 # including protecting against closing </script> tags in the entry.
244 # does the double quotes for ya.
245 sub ejs_string {
246 my $str = ejs($_[0]);
247 $str =~ s!</script!</scri\" + \"pt!gi;
248 return "\"" . $str . "\"";
251 # changes every char in a string to %XX where XX is the hex value
252 # this is useful for passing strings to javascript through HTML, because
253 # javascript's "unescape" function expects strings in this format
254 sub ejs_all
256 my $a = $_[0];
257 $a =~ s/(.)/uc sprintf("%%%02x",ord($1))/eg;
258 return $a;
262 # strip all HTML tags from a string
263 sub strip_html {
264 my $str = shift;
265 my $opts = shift || {};
266 =head
267 $str =~ s/\<lj user\=['"]?([\w-]+)['"]?\>/$1/g; # "
268 if ($opts->{use_space}) {
269 $str =~ s/\<([^\<])+\>/ /g;
270 } else {
271 $str =~ s/\<([^\<])+\>//g;
273 return $str;
274 =cut
276 my $p = HTML::Parser->new(api_version => 3,
277 handlers => {
278 text => [sub { $_[0]->{res} .= $_[1] }, 'self, text'], # concat plain text
279 # handle tags
280 start => [sub {
281 my ($self, $tag, $attrs, $origtext) = @_;
282 if ($tag =~ /lj/i){
283 if ($opts->{'expand_lj_user_tag'}) {
284 $self->{res} .= $origtext;
285 } else {
286 $self->{res} .= $attrs->{user} || $attrs->{comm}; # <lj user="username" title=".."> -> username
288 } else {
289 $self->{res} .= ' ' if $opts->{use_space}; # for other tags add spaces if needed.
291 if ($opts->{noparse_tags}) {
292 for (@{$opts->{noparse_tags}}) {
293 $self->{res} .= $origtext if $tag eq $_;
297 'self, tagname, attr, text'
299 end => [sub {
300 my ($self, $tag, $origtext) = @_;
301 if ($opts->{noparse_tags}) {
302 for (@{$opts->{noparse_tags}}) {
303 $self->{res} .= $origtext if $tag eq $_;
306 }, 'self, tagname, text'
310 $p->parse($str);
311 $p->eof;
313 return $p->{res};
316 # <LJFUNC>
317 # name: LJ::is_ascii
318 # des: checks if text is pure ASCII.
319 # args: text
320 # des-text: text to check for being pure 7-bit ASCII text.
321 # returns: 1 if text is indeed pure 7-bit, 0 otherwise.
322 # </LJFUNC>
323 sub is_ascii {
324 my $text = shift;
325 return ($text !~ m/[^\x01-\x7f]/);
328 # <LJFUNC>
329 # name: LJ::is_utf8
330 # des: check text for UTF-8 validity.
331 # args: text
332 # des-text: text to check for UTF-8 validity
333 # returns: 1 if text is a valid UTF-8 stream, 0 otherwise.
334 # </LJFUNC>
335 sub is_utf8 {
336 my $text = shift;
338 if (LJ::are_hooks("is_utf8")) {
339 return LJ::run_hook("is_utf8", $text);
342 return 1 if Encode::is_utf8($text);
344 my $eval_res = eval { Encode::decode_utf8($text); 1 };
345 return $eval_res ? 1 : 0;
348 # <LJFUNC>
349 # name: LJ::text_out
350 # des: force outgoing text into valid UTF-8.
351 # args: text
352 # des-text: reference to text to pass to output. Text if modified in-place.
353 # returns: nothing.
354 # </LJFUNC>
355 sub text_out
357 my $rtext = shift;
359 # if we're not Unicode, do nothing
360 return unless $LJ::UNICODE;
362 # is this valid UTF-8 already?
363 return if LJ::is_utf8($$rtext);
365 # no. Blot out all non-ASCII chars
366 $$rtext =~ s/[\x00\x80-\xff]/\?/g;
367 return;
370 # <LJFUNC>
371 # name: LJ::text_in
372 # des: do appropriate checks on input text. Should be called on all
373 # user-generated text.
374 # args: text
375 # des-text: text to check
376 # returns: 1 if the text is valid, 0 if not.
377 # </LJFUNC>
378 sub text_in
380 my $text = shift;
381 return 1 unless $LJ::UNICODE;
382 if (ref ($text) eq "HASH") {
383 return ! (grep { !LJ::is_utf8($_) } values %{$text});
385 if (ref ($text) eq "ARRAY") {
386 return ! (grep { !LJ::is_utf8($_) } @{$text});
388 return LJ::is_utf8($text);
391 # <LJFUNC>
392 # name: LJ::text_convert
393 # des: convert old entries/comments to UTF-8 using user's default encoding.
394 # args: dbs?, text, u, error
395 # des-dbs: optional. Deprecated; a master/slave set of database handles.
396 # des-text: old possibly non-ASCII text to convert
397 # des-u: user hashref of the journal's owner
398 # des-error: ref to a scalar variable which is set to 1 on error
399 # (when user has no default encoding defined, but
400 # text needs to be translated).
401 # returns: converted text or undef on error
402 # </LJFUNC>
403 sub text_convert
405 &nodb;
406 my ($text, $u, $error) = @_;
408 # maybe it's pure ASCII?
409 return $text if LJ::is_ascii($text);
411 # load encoding id->name mapping if it's not loaded yet
412 LJ::load_codes({ "encoding" => \%LJ::CACHE_ENCODINGS } )
413 unless %LJ::CACHE_ENCODINGS;
415 if ($u->{'oldenc'} == 0 ||
416 not defined $LJ::CACHE_ENCODINGS{$u->{'oldenc'}}) {
417 $$error = 1;
418 return undef;
421 # convert!
422 my $name = $LJ::CACHE_ENCODINGS{$u->{'oldenc'}};
423 unless (LJ::ConvUTF8->supported_charset($name)) {
424 $$error = 1;
425 return undef;
428 return LJ::ConvUTF8->to_utf8($name, $text);
432 # <LJFUNC>
433 # name: LJ::text_length
434 # des: returns both byte length and character length of a string. In a non-Unicode
435 # environment, this means byte length twice. In a Unicode environment,
436 # the function assumes that its argument is a valid UTF-8 string.
437 # args: text
438 # des-text: the string to measure
439 # returns: a list of two values, (byte_length, char_length).
440 # </LJFUNC>
442 sub text_length
444 my $text = shift;
445 my $bl = length($text);
446 unless ($LJ::UNICODE) {
447 return ($bl, $bl);
449 my $cl = 0;
450 my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
452 while ($text =~ m/$utf_char/go) { $cl++; }
453 return ($bl, $cl);
456 # <LJFUNC>
457 # name: LJ::text_trim
458 # des: truncate string according to requirements on byte length, char
459 # length, or both. "char length" means number of UTF-8 characters if
460 # [ljconfig[unicode]] is set, or the same thing as byte length otherwise.
461 # args: text, byte_max, char_max
462 # des-text: the string to trim
463 # des-byte_max: maximum allowed length in bytes; if 0, there's no restriction
464 # des-char_max: maximum allowed length in chars; if 0, there's no restriction
465 # returns: the truncated string.
466 # </LJFUNC>
467 sub text_trim
469 my ($text, $byte_max, $char_max) = @_;
470 return $text unless $byte_max or $char_max;
471 if (!$LJ::UNICODE) {
472 $byte_max = $char_max if $char_max and $char_max < $byte_max;
473 $byte_max = $char_max unless $byte_max;
474 return substr($text, 0, $byte_max);
476 my $cur = 0;
477 my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
479 # if we don't have a character limit, assume it's the same as the byte limit.
480 # we will never have more characters than bytes, but we might have more bytes
481 # than characters, so we can't inherit the other way.
482 $char_max ||= $byte_max;
483 $byte_max ||= ($char_max + 0) * 4;
485 while ($text =~ m/$utf_char/gco) {
486 last unless $char_max;
487 last if $cur + length($1) > $byte_max and $byte_max;
488 $cur += length($1);
489 $char_max--;
491 return substr($text,0,$cur);
494 # trim string, but not truncate in middle of the word
495 # Deprecated. Use LJ::Text->truncate_to_word_with_ellipsis
496 sub trim_at_word
498 my ($text, $char_max) = @_;
500 return $text if length($text) <= $char_max;
502 $char_max -= 3; # space for '...'
504 my $short_text = text_trim($text, undef, $char_max);
505 my $short_len = length($short_text);
506 my $full_len = length($text);
508 if ($short_len < $full_len) { # really trimmed
509 # need correct last word and add '...'
510 my $last_char = substr($short_text, -1, 1);
511 my $first_char = substr($text, $short_len, 1);
512 if ($last_char ne ' ' and $first_char ne ' ') {
513 my $space_idx = rindex($short_text, ' ');
514 my $dot_idx = rindex($short_text, '.');
515 my $comma_idx = rindex($short_text, ',');
516 my $semi_idx = rindex($short_text, ';');
517 my $colon_idx = rindex($short_text, ':');
519 my $max = (sort {$b <=> $a} $space_idx, $dot_idx, $comma_idx, $semi_idx, $colon_idx)[0];
520 $short_text = substr($text, 0, $max);
522 # attention: &#2116; must not lose ';' sign
523 if ($max == $semi_idx) {
524 my $one_char_longer = substr($text, 0, $max + 1);
525 if ($one_char_longer =~ /&.+;$/) { # entity in any form
526 $short_text = $one_char_longer; # we must keep in whole
530 # seconde attempt to reduce text to the end of phrase
531 return $short_text . '...' if $short_text =~ s/([.;:!?])[^\\1]{1,5}$//;
535 return $short_text . '...';
538 # <LJFUNC>
539 # name: LJ::html_trim_4gadgets
540 # des: truncate string according to requirements on char length.
541 # args: text, char_max
542 # des-text: the string to trim
543 # des-char_max: maximum allowed length in chars; if 0, there's no restriction
544 # returns: the truncated string.
545 # </LJFUNC>
547 # internal subs
549 sub _html_trim_4gadgets_count_chars
551 my ($l, $text_out_len, $max_len) = @_;
553 if ($$text_out_len + $l > $max_len) {
554 return 0;
555 } else {
556 $$text_out_len += $l;
557 return $l;
561 my %_html_trim_4gadgets_autoclose = map { $_ => $_ } qw(p li ol ul a strike);
563 # Subroutine itself
564 sub html_trim_4gadgets
566 my $text = shift;
567 my $max_len = shift;
568 my $link = shift;
570 my $text_out = '';
571 my $text_out_len = 0;
572 my $finish = 0;
574 my @tags2autoclose = ();
575 my $text_before_table = '';
577 # collapse all white spaces to one space.
578 $text =~ s/\s{2,}/ /g;
580 # remove <lj-cut> ... </lj-cut>
581 $text =~ s/\(\&nbsp;<a href="http:\/\/.+?\.html\?#cutid.+?<\/a>\&nbsp;\)//g;
583 my $clean_tag = sub {
584 my ($type, $tag, $attr, $protected) = @_;
585 my $ret = '';
587 $ret = "<$tag />" if 'S' eq $type && $attr->{'/'};
589 if ('S' eq $type) {
590 my $added_attrs = '';
591 if ($protected) {
592 foreach my $k (keys %$attr) {
593 delete $attr->{$k} unless $protected->{lc $k};
595 $added_attrs = join(' ', map { $attr->{$_} . "=\"$_\"" } keys %$attr);
597 $ret = "<$tag$added_attrs>";
598 push @tags2autoclose, $tag if exists $_html_trim_4gadgets_autoclose{lc $tag};
599 } else {
600 pop @tags2autoclose;
601 $ret = "</$tag>";
604 return $ret;
607 my $clean_table_tag = sub { $clean_tag->(@_, { map { $_ => $_ } qw(rowspan colspan) } ) };
609 my $clean_hn_tag = sub {
610 my ($type, $tag, $attr) = @_;
611 my $ret = '';
613 $ret = "<p /><strong />" if 'S' eq $type && $attr->{'/'};
615 if ('S' eq $type) {
616 $ret = "<p><strong>";
617 } else {
618 $ret = "</p></strong>";
621 return $ret;
624 my %reconstruct = (
625 img => sub {
626 my ($type, $tag, $attr) = @_;
628 if ('S' eq $type && $attr->{'src'}) {
629 # <img ...> tag count as 50 chars
630 if (_html_trim_4gadgets_count_chars(50, \$text_out_len, $max_len)) {
631 return "<img src=\"".$attr->{'src'}."\" border=\"0\" />";
632 } else {
633 $finish = 1;
636 return '';
639 a => sub {
640 my ($type, $tag, $attr) = @_;
642 if ('S' eq $type && $attr->{'href'}) {
643 push @tags2autoclose, $tag;
644 return "<$tag href=\"".$attr->{'href'}."\" target=\"_blank\">";
646 if ('E' eq $type) {
647 pop @tags2autoclose;
648 return "</$tag>";
650 return '';
653 p => $clean_tag,
654 br => $clean_tag,
655 wbr => $clean_tag,
656 li => $clean_tag,
657 ol => $clean_tag,
658 ul => $clean_tag,
659 s => $clean_tag,
660 strike => $clean_tag,
662 table => sub {
663 my ($type, $tag, $attr) = @_;
665 if ('S' eq $type && $attr->{'href'}) {
666 $text_before_table = $text_out unless $text_before_table;
667 push @tags2autoclose, $tag;
668 return "<$tag cellpadding=\"5\" cellspacing=\"5\" border=\"0\">"
670 if ('E' eq $type) {
671 pop @tags2autoclose;
672 $text_before_table = '' unless grep { /table/i } @tags2autoclose;
673 return "</$tag>";
675 return '';
678 th => $clean_table_tag,
679 tr => $clean_table_tag,
680 td => $clean_table_tag,
682 h1 => $clean_hn_tag,
683 h2 => $clean_hn_tag,
684 h3 => $clean_hn_tag,
685 h4 => $clean_hn_tag,
686 h5 => $clean_hn_tag,
687 h6 => $clean_hn_tag,
689 'lj-embed' => sub {
690 my ($type, $tag, $attr) = @_;
692 if ('S' eq $type && $attr->{'id'}) {
693 return "<$tag id=\"".$attr->{'id'}."\" />";
696 return '';
701 my $p = HTML::TokeParser->new(\$text);
703 while (my $token = $p->get_token) {
704 my ($type, $tag, $attr) = @$token;
706 if ('T' eq $type) {
707 if(_html_trim_4gadgets_count_chars(length($tag),\$text_out_len,$max_len)) {
708 $text_out .= $tag;
709 } else {
710 # Try to cut $tag and add some words, not whole text.
711 $text_out .= LJ::trim_at_word($tag, $max_len - $text_out_len);
712 $text_out =~ s/\.\.\.$//; # remove last '...' added by LJ::trim_at_word()
713 $finish = 1;
715 next;
718 if (exists($reconstruct{lc $tag})) {
719 $text_out .= $reconstruct{$tag}->($type, $tag, $attr);
722 # <lj-poll-n>
723 if ($tag =~ m/^lj-poll-(\d+)$/g && 'S' eq $type) {
724 my $pollid = $1;
725 my $name = LJ::Poll->new($pollid)->name;
726 if ($name) {
727 LJ::Poll->clean_poll(\$name);
728 } else {
729 $name = "#$pollid";
732 $text_out .= "<a href=\"$LJ::SITEROOT/poll/?id=$pollid\" target=\"_blank\" >View Poll: $name.</a>";
735 last if $finish;
738 # Close all open tags.
739 if ($finish && @tags2autoclose) {
740 while ($_ = pop @tags2autoclose) {
741 if ('table' eq lc $_) {
742 $text_out = $text_before_table;
743 last;
745 $text_out .= "</$_>";
749 return $text_out . ($finish && $link ? "<a href=\"$link\">...</a>" : '');
752 # <LJFUNC>
753 # name: LJ::text_compress
754 # des: Compresses a chunk of text, to gzip, if configured for site. Can compress
755 # a scalarref in place, or return a compressed copy. Won't compress if
756 # value is too small, already compressed, or size would grow by compressing.
757 # args: text
758 # des-text: either a scalar or scalarref
759 # returns: nothing if given a scalarref (to compress in-place), or original/compressed value,
760 # depending on site config.
761 # </LJFUNC>
762 sub text_compress
764 my $text = shift;
765 my $ref = ref $text;
766 return $ref ? undef : $text unless $LJ::COMPRESS_TEXT;
767 die "Invalid reference" if $ref && $ref ne "SCALAR";
769 my $tref = $ref ? $text : \$text;
770 my $pre_len = length($$tref);
771 unless (substr($$tref,0,2) eq "\037\213" || $pre_len < 100) {
772 my $gz = Compress::Zlib::memGzip($$tref);
773 if (length($gz) < $pre_len) {
774 $$tref = $gz;
778 return $ref ? undef : $$tref;
781 # <LJFUNC>
782 # name: LJ::text_uncompress
783 # des: Uncompresses a chunk of text, from gzip, if configured for site. Can uncompress
784 # a scalarref in place, or return a compressed copy. Won't uncompress unless
785 # it finds the gzip magic number at the beginning of the text.
786 # args: text
787 # des-text: either a scalar or scalarref.
788 # returns: nothing if given a scalarref (to uncompress in-place), or original/uncompressed value,
789 # depending on if test was compressed or not
790 # </LJFUNC>
791 sub text_uncompress
793 my $text = shift;
794 my $ref = ref $text;
795 die "Invalid reference" if $ref && $ref ne "SCALAR";
796 my $tref = $ref ? $text : \$text;
798 # check for gzip's magic number
799 if (substr($$tref,0,2) eq "\037\213") {
800 $$tref = Compress::Zlib::memGunzip($$tref);
803 return $ref ? undef : $$tref;
806 # trimm text for small widgets(myspace, yandexbar, facebook and etc.)
807 # args:
808 # text =>
809 # length =>
810 # img_length =>
811 # return: trimmed text
812 sub trim_widgets {
813 my %args = @_;
814 my $max_length = $args{length};
815 my $img_length = $args{img_length} || 50;
816 my @allow_tags = qw(img p li ol ul a br s strike table th tr td h1 h2 h3 h4 h5 h6 lj);
817 my @close_tags = qw(li ol ul a s strike h1 h2 h3 h4 h5 h6);
819 my $event = '';
820 my $event_length = 0;
821 my $table_tags = 0;
822 my $buff = '';
823 my $buff_length = 0;
824 my @tags_stack;
825 my @parts = split /(<[^>]*>)/, $args{text};
826 while (defined(my $slice = shift @parts)){
827 if( my ($close_tag, $tag, $attrib) = ($slice =~ m#<(/?)\s*(\w+)(\s+[^>]*)?>#) ){
828 next unless grep {$tag eq $_} @allow_tags;
829 if( $close_tag ){
830 my $j;
831 for $j( 0 .. @tags_stack ){
832 last if $tags_stack[$j] eq $tag;
834 splice(@tags_stack, $j, 1);
835 $table_tags-- if ($tag eq 'table') && $table_tags;
836 } else {
837 if( $tag eq 'img' ){
838 if ($buff_length + $event_length > $max_length - $img_length){
839 push @parts, $slice;
840 last;
842 $attrib =~ s#.*(src=['"][^'"]*['"])$#$1 /#;
843 $buff_length += $img_length;
844 } elsif( $tag eq 'a' ) {
845 $attrib =~ s#.*(href=['"][^'"]*['"]).*$#$1#;
846 $attrib = 'target="_blank" ' . $attrib;
847 } elsif( $tag eq 'table' ){
848 $attrib = 'cellpadding="5" cellspacing="5" border="0"';
849 $table_tags++;
850 } elsif( $tag =~ /t[hdr]/ ){
851 $attrib = join(' ', grep {$_ =~ /^(col|row)span/}
852 split /\s+/, $attrib);
855 unshift @tags_stack, $tag
856 if grep {$tag eq $_} @close_tags;
858 $slice = "<$tag" . ($attrib?" $attrib>":'>');
860 $slice = $close_tag?'</strong></p>':'<p><strong>' if $tag =~ /h\d/;
861 } else {
862 my $slice_length = LJ::text_length($slice);
863 if ($event_length + $buff_length > $max_length - $slice_length) {
864 if ($table_tags){
865 push @parts, $slice;
866 last;
868 my @words = split /([\n\s]+)/, $slice;
869 for my $w (@words){
870 my $word_length = LJ::text_length($w);
871 if ($event_length + $word_length > $max_length){
872 push @parts, $slice;
873 last;
875 $event_length += $word_length;
876 $event .= $w;
878 last;
880 $buff_length += $slice_length;
883 $buff .= $slice;
884 unless( $table_tags ){
885 $event_length += $buff_length;
886 $event .= $buff;
887 $buff_length = 0;
888 $buff = '';
892 $event = $event . "</$_>" for @tags_stack;
893 $event = $event . $args{'read_more'} if @parts;
894 return $event;
897 # event => text
898 # embed_url => url
899 sub convert_lj_tags_to_links {
900 my %args = @_;
901 while ($args{event} =~ /<lj-poll-(\d+)>/g) {
902 my $pollid = $1;
903 my $name = LJ::Poll->new($pollid)->name;
904 if ($name) {
905 LJ::Poll->clean_poll(\$name);
906 } else {
907 $name = "#$pollid";
909 my $polltext = LJ::Lang::ml('entry.ljpoll', {name=>$name});
910 $args{event} =~ s|<lj-poll-$pollid>|<a href="$LJ::SITEROOT/poll/?id=$pollid" target="_blank" >$polltext</a>|g;
913 my $embedtext = LJ::Lang::ml('entry.ljembed');
914 $args{event} =~ s|<lj\-embed[^>]+/>|<a href="$args{embed_url}">$embedtext</a>|g;
915 while ( $args{event} =~ /<lj\s+user="([^>"]+)"\s*\/?>/g ){
916 # follow the documentation - no about communites, openid or syndicated, just user
917 my $username = $1;
918 if (my $user = LJ::load_user($username)) {
919 # $username may be not equal to $name (if $username is not canonical username)
920 my $name = $user->username;
921 my $html = '<a href="' . $user->profile_url . '" target="_blank"><img src="'
922 . $LJ::IMGPREFIX . '/userinfo.gif?v=17080" alt=""></a><a href="'
923 . $user->journal_base . '" target="_blank">' . $name . '</a>';
924 $args{event} =~ s|<lj\s+user="$username"\s*\/?>|$html|g;
925 } else {
926 $args{event} =~ s|<lj\s+user="$username"\s*\/?>|$username|g;
929 $args{event} =~ s|</?lj-cut[^>]*>||g;
931 return $args{event};
934 # function to trim a string containing HTML. this will auto-close any
935 # html tags that were still open when the string was truncated
936 sub html_trim {
937 my ($text, $char_max) = @_;
939 return $text unless $char_max;
941 my $p = HTML::TokeParser->new(\$text);
942 my @open_tags; # keep track of what tags are open
943 my $out = '';
944 my $content_len = 0;
946 TOKEN:
947 while (my $token = $p->get_token) {
948 my $type = $token->[0];
949 my $tag = $token->[1];
950 my $attr = $token->[2]; # hashref
952 if ($type eq "S") {
953 my $selfclose;
955 # start tag
956 $out .= "<$tag";
958 # assume tags are properly self-closed
959 $selfclose = 1 if lc $tag eq 'input' || lc $tag eq 'br' || lc $tag eq 'img';
961 # preserve order of attributes. the original order is
962 # in element 4 of $token
963 foreach my $attrname (@{$token->[3]}) {
964 if ($attrname eq '/') {
965 $selfclose = 1;
966 next;
969 # FIXME: ultra ghetto.
970 $attr->{$attrname} = LJ::no_utf8_flag($attr->{$attrname});
971 $out .= " $attrname=\"" . LJ::ehtml($attr->{$attrname}) . "\"";
974 $out .= $selfclose ? " />" : ">";
976 push @open_tags, $tag unless $selfclose;
978 } elsif ($type eq 'T' || $type eq 'D') {
979 my $content = $token->[1];
981 if (length($content) + $content_len > $char_max) {
983 # truncate and stop parsing
984 $content = LJ::text_trim($content, undef, ($char_max - $content_len));
985 $out .= $content;
986 last;
989 $content_len += length $content;
991 $out .= $content;
993 } elsif ($type eq 'C') {
994 # comment, don't care
995 $out .= $token->[1];
997 } elsif ($type eq 'E') {
998 # end tag
999 pop @open_tags;
1000 $out .= "</$tag>";
1004 $out .= join("\n", map { "</$_>" } reverse @open_tags);
1006 return $out;
1009 # takes a number, inserts commas where needed
1010 sub commafy {
1011 my $number = shift;
1012 return $number unless $number =~ /^\d+$/;
1014 my $punc = LJ::Lang::ml('number.punctuation') || ",";
1015 $number =~ s/(?<=\d)(?=(\d\d\d)+(?!\d))/$punc/g;
1016 return $number;
1019 # <LJFUNC>
1020 # name: LJ::html_newlines
1021 # des: Replace newlines with HTML break tags.
1022 # args: text
1023 # returns: text, possibly including HTML break tags.
1024 # </LJFUNC>
1025 sub html_newlines
1027 my $text = shift;
1028 $text =~ s/\n/<br \/>/gm;
1030 return $text;
1033 # given HTML, returns an arrayref of URLs to images that are in the HTML
1034 sub html_get_img_urls {
1035 my $htmlref = shift;
1036 my %opts = @_;
1038 my $exclude_site_imgs = $opts{exclude_site_imgs} || 0;
1040 my @image_urls;
1041 my $p = HTML::TokeParser->new($htmlref);
1042 return []
1043 unless $p;
1044 while (my $token = $p->get_token) {
1045 if ($token->[1] eq 'img' and ref $token->[2] eq 'HASH') {
1046 my $attrs = $token->[2];
1048 if ( exists $attrs->{'src'} && $attrs->{'src'} ) {
1049 push @image_urls, $attrs->{'src'} if
1050 ($exclude_site_imgs ? $attrs->{'src'} !~ /^$LJ::IMGPREFIX/ : 1);
1055 return \@image_urls;
1058 # given HTML, returns an arrayref of link URLs that are in the HTML
1059 sub html_get_link_urls {
1060 my $htmlref = shift;
1061 my %opts = @_;
1063 my @link_urls;
1064 my $p = HTML::TokeParser->new($htmlref);
1066 while (my $token = $p->get_token) {
1067 if ($token->[0] eq "S" && $token->[1] eq "a" and ref $token->[2] eq 'HASH') {
1068 my $attrs = $token->[2];
1069 foreach my $attr (keys %$attrs) {
1070 push @link_urls, $attrs->{$attr} if $attr eq "href";
1075 return \@link_urls;
1078 sub delimited_number {
1079 my ($number, $delimiter) = @_;
1080 $delimiter ||= ' ';
1082 $number =~ s/(\d{1,3})(?=(\d{3})+$)/$1$delimiter/g;
1084 return $number;