From bc311cda51a2b3c4b9b8fea5a257bca6c5b70c28 Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Mon, 13 Apr 2015 19:46:46 -0700 Subject: [PATCH] gitweb: improve blob_plain charset When gitweb returns blob_plain data it doesn't really do a very good job returning the correct charset value. Update the logic so that when returning a text/... type the charset will be set correctly in most cases based on the first portion of the content. Signed-off-by: Kyle J. McKay --- gitweb/gitweb.perl | 77 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/gitweb/gitweb.perl b/gitweb/gitweb.perl index faac5e5a12..e6e8678904 100755 --- a/gitweb/gitweb.perl +++ b/gitweb/gitweb.perl @@ -165,7 +165,7 @@ our $strict_export = "++GITWEB_STRICT_EXPORT++"; our @git_base_url_list = grep { $_ ne '' } ("++GITWEB_BASE_URL++"); # default blob_plain mimetype and default charset for text/plain blob -our $default_blob_plain_mimetype = 'text/plain'; +our $default_blob_plain_mimetype = 'application/octet-stream'; our $default_text_plain_charset = undef; # file to use for guessing MIME types before trying /etc/mime.types @@ -3908,39 +3908,65 @@ sub mimetype_guess { sub blob_mimetype { my $fd = shift; my $filename = shift; + my $mime; + + $mime = mimetype_guess($filename) if defined $filename; - if ($filename) { - my $mime = mimetype_guess($filename); - $mime and return $mime; + if (!$mime && $filename) { + if ($filename =~ m/\.html?$/i) { + $mime = 'text/html'; + } elsif ($filename =~ m/\.xht(?:ml)?$/i) { + $mime = 'text/html'; + } elsif ($filename =~ m/\.te?xt?$/i) { + $mime = 'text/plain'; + } elsif ($filename =~ m/\.(?:markdown|md)$/i) { + $mime = 'text/plain'; + } elsif ($filename =~ m/\.png$/i) { + $mime = 'image/png'; + } elsif ($filename =~ m/\.gif$/i) { + $mime = 'image/gif'; + } elsif ($filename =~ m/\.jpe?g$/i) { + $mime = 'image/jpeg'; + } elsif ($filename =~ m/\.svgz?$/i) { + $mime = 'image/svg+xml'; + } } # just in case - return $default_blob_plain_mimetype unless $fd; - - if (-T $fd) { - return 'text/plain'; - } elsif (! $filename) { - return 'application/octet-stream'; - } elsif ($filename =~ m/\.png$/i) { - return 'image/png'; - } elsif ($filename =~ m/\.gif$/i) { - return 'image/gif'; - } elsif ($filename =~ m/\.jpe?g$/i) { - return 'image/jpeg'; - } else { - return 'application/octet-stream'; - } + return $default_blob_plain_mimetype unless $fd || $mime; + + $mime = -T $fd ? 'text/plain' : 'application/octet-stream' unless $mime; + + return $mime; +} + +sub is_ascii { + use bytes; + my $data = shift; + return scalar($data =~ /^[\x00-\x7f]*$/); +} + +sub is_valid_utf8 { + my $data = shift; + return utf8::decode($data); } sub blob_contenttype { my ($fd, $file_name, $type) = @_; + my $leader; $type ||= blob_mimetype($fd, $file_name); - if ($type eq 'text/plain' && defined $default_text_plain_charset) { - $type .= "; charset=$default_text_plain_charset"; + return $type unless $type =~ m!^text/.+!; + if ($fd && read($fd, $leader, 16384)) { + return ("$type; charset=US-ASCII", $leader) if is_ascii($leader); + return ("$type; charset=UTF-8", $leader) if is_valid_utf8($leader); + return ("$type; charset=ISO-8859-1", $leader) + if defined $default_text_plain_charset && + $default_text_plain_charset =~ /utf-?8/i; } - - return $type; + return ("$type; charset=$default_text_plain_charset", $leader) + if defined $default_text_plain_charset; + return ($type, $leader); } # guess file syntax for syntax highlighting; return undef if no highlighting @@ -7023,7 +7049,8 @@ sub git_blob_plain { binmode($fd); # content-type (can include charset) - $type = blob_contenttype($fd, $file_name, $type); + my $leader; + ($type, $leader) = blob_contenttype($fd, $file_name, $type); # "save as" filename, even when no $file_name is given my $save_as = "$hash"; @@ -7058,6 +7085,7 @@ sub git_blob_plain { ($sandbox ? 'attachment' : 'inline') . '; filename="' . $save_as . '"'); binmode STDOUT, ':raw'; + print $leader if defined $leader; my $buf; while (read($fd, $buf, 32768)) { print $buf; @@ -7085,6 +7113,7 @@ sub git_blob { my $have_blame = gitweb_check_feature('blame'); defined(my $fd = git_cmd_pipe "cat-file", "blob", $hash) or die_error(500, "Couldn't cat $file_name, $hash"); + binmode($fd); my $mimetype = blob_mimetype($fd, $file_name); # use 'blob_plain' (aka 'raw') view for files that cannot be displayed if ($mimetype !~ m!^(?:text/|image/(?:gif|png|jpeg)$)! && -B $fd) { -- 2.11.4.GIT