URI::Fetch::Decode
Plaggerのcodeを参考(ほとんどコピー)にした。
URI::FetchでとってきたコンテンツをUTF8 flaggedにする。
use URI::Fetch::Decode;
my $fetch = URI::Fetch::Decode->fetch("http://www.yahoo.co.jp/");
my $utf8 = $fetch->decode_content;
あんまりテストしてないので注意。
package URI::Fetch::Decode;
use strict;
use warnings;
use base qw(URI::Fetch);
our $VERSION = '0.01';
sub fetch{
my $class = shift;
my $fetch = $class->SUPER::fetch(@_);
return unless $fetch;
bless $fetch, "URI::Fetch::Decode::Response";
}
1;
package URI::Fetch::Decode::Response;
use strict;
use warnings;
use base qw(URI::Fetch::Response);
use Encode ();
our $Detector;
BEGIN {
if ( eval { require Encode::Detect::Detector; 1 } ) {
$Detector = sub { Encode::Detect::Detector::detect($_[0]) };
} else {
require Encode::Guess;
$Detector = sub {
my @guess = qw(utf-8 euc-jp shift_jis); # xxx japanese only?
eval { Encode::Guess::guess_encoding($_[0], @guess)->name };
};
}
}
sub decode_content{
my $res = shift;
my $content = $res->content;
return unless $content;
# 1) get charset from HTTP Content-Type header
my $charset = ($res->http_response->content_type =~ /charset=([\w\-]+)/)[0];
# 2) if there's not, try META tag
$charset ||= ( $content =~ m!<meta http-equiv="Content-Type" content=".*charset=([\w\-]+)"!i )[0];
# 3) if there's not still, try Detector/Guess
$charset ||= $Detector->($content);
# 4) falls back to UTF-8
$charset ||= 'utf-8';
my $decoded = eval { Encode::decode($charset, $content) };
if ($@ && $@ =~ /Unknown encoding/) {
$charset = $Detector->($content) || 'utf-8';
$decoded = Encode::decode($charset, $content);
}
$decoded;
}
1;