URI::Fetch::Decode
Plaggerのcodeを参考(ほとんどコピー)にした。
URI::FetchでとってきたコンテンツをUTF8 flaggedにする。
use URI::Fetch::Decode; my $fetch = URI::Fetch::Decode->fetch("http://www.yahoo.co.jp/"); my $utf8 = $fetch->decode_content;
あんまりテストしてないので注意。
package URI::Fetch::Decode; use strict; use warnings; use base qw(URI::Fetch); our $VERSION = '0.01'; sub fetch{ my $class = shift; my $fetch = $class->SUPER::fetch(@_); return unless $fetch; bless $fetch, "URI::Fetch::Decode::Response"; } 1; package URI::Fetch::Decode::Response; use strict; use warnings; use base qw(URI::Fetch::Response); use Encode (); our $Detector; BEGIN { if ( eval { require Encode::Detect::Detector; 1 } ) { $Detector = sub { Encode::Detect::Detector::detect($_[0]) }; } else { require Encode::Guess; $Detector = sub { my @guess = qw(utf-8 euc-jp shift_jis); # xxx japanese only? eval { Encode::Guess::guess_encoding($_[0], @guess)->name }; }; } } sub decode_content{ my $res = shift; my $content = $res->content; return unless $content; # 1) get charset from HTTP Content-Type header my $charset = ($res->http_response->content_type =~ /charset=([\w\-]+)/)[0]; # 2) if there's not, try META tag $charset ||= ( $content =~ m!<meta http-equiv="Content-Type" content=".*charset=([\w\-]+)"!i )[0]; # 3) if there's not still, try Detector/Guess $charset ||= $Detector->($content); # 4) falls back to UTF-8 $charset ||= 'utf-8'; my $decoded = eval { Encode::decode($charset, $content) }; if ($@ && $@ =~ /Unknown encoding/) { $charset = $Detector->($content) || 'utf-8'; $decoded = Encode::decode($charset, $content); } $decoded; } 1;