« 近況 | メイン | TypeKeyのIDに使える文字 »

URI::Fetch::Decode

Plaggerのcodeを参考(ほとんどコピー)にした。
URI::FetchでとってきたコンテンツをUTF8 flaggedにする。

use URI::Fetch::Decode;
my $fetch = URI::Fetch::Decode->fetch("http://www.yahoo.co.jp/");
my $utf8 = $fetch->decode_content;


あんまりテストしてないので注意。

package URI::Fetch::Decode;

use strict;
use warnings;
use base qw(URI::Fetch);

our $VERSION = '0.01';

sub fetch{
    my $class = shift;
    my $fetch = $class->SUPER::fetch(@_);
    return unless $fetch;
    bless $fetch, "URI::Fetch::Decode::Response";
}

1;

package URI::Fetch::Decode::Response;

use strict;
use warnings;
use base qw(URI::Fetch::Response);
use Encode ();

our $Detector;

BEGIN {
    if ( eval { require Encode::Detect::Detector; 1 } ) {
        $Detector = sub { Encode::Detect::Detector::detect($_[0]) };
    } else {
        require Encode::Guess;
        $Detector = sub {
            my @guess = qw(utf-8 euc-jp shift_jis); # xxx japanese only?
            eval { Encode::Guess::guess_encoding($_[0], @guess)->name };
        };
    }
}


sub decode_content{
    my $res = shift;
    my $content = $res->content;

    return unless $content;

    # 1) get charset from HTTP Content-Type header
    my $charset = ($res->http_response->content_type =~ /charset=([\w\-]+)/)[0];

    # 2) if there's not, try META tag
    $charset ||= ( $content =~ m!<meta http-equiv="Content-Type" content=".*charset=([\w\-]+)"!i )[0];

    # 3) if there's not still, try Detector/Guess
    $charset ||= $Detector->($content);

    # 4) falls back to UTF-8
    $charset ||= 'utf-8';

    my $decoded = eval { Encode::decode($charset, $content) };

    if ($@ && $@ =~ /Unknown encoding/) {
        $charset = $Detector->($content) || 'utf-8';
        $decoded = Encode::decode($charset, $content);
    }

    $decoded;
}

1;