#!/usr/bin/perl
use strict;
use warnings;
use HTML::Selector::XPath 0.03;
use Web::Scraper;
use URI;
use YAML;
# Pages to scrape. Order matters: entries come in (Japanese, English) pairs,
# and the processing loop merges each English page into the Japanese page
# that immediately precedes it.
my @url = map { URI->new($_) } (
    # basic pictographs
    "http://www.nttdocomo.co.jp/service/imode/make/content/pictograph/basic/index.html",
    "http://www.nttdocomo.co.jp/english/service/imode/make/content/pictograph/basic/index.html",
    # extended pictographs ("extention" is the spelling used in the site's URL)
    "http://www.nttdocomo.co.jp/service/imode/make/content/pictograph/extention/index.html",
    "http://www.nttdocomo.co.jp/english/service/imode/make/content/pictograph/extention/index.html",
);
# Accumulated output: one hashref per table row, carrying the fields scraped
# from the Japanese page plus name_en taken from the matching English page.
my $res = [];
# Count of pages processed so far (odd => first of a pair, even => second).
my $i = 0;
# Rows of the first page of the current pair, held until the second arrives.
my @prev;

# Pages are consumed strictly in pairs; with an odd count the last page would
# be scraped and then silently discarded.
@url % 2 == 0 or die "URL list must consist of ja/en page pairs";

# The row scraper does not depend on the URL, so build it once up front
# instead of once per page.
my $scraper = scraper {
    process 'tr', 'characters[]', scraper {
        process 'td:nth-child(1)',       'number', 'TEXT';
        # The filter stringifies the extracted @src value
        # (presumably a URI object -- confirm against Web::Scraper).
        process 'td:nth-child(2) > img', 'image', [ '@src', sub { $_->as_string } ];
        process 'td:nth-child(3)',       'sjis', 'TEXT';
        # Column 4 is intentionally not captured.
        process 'td:nth-child(5)',       'unicode', 'TEXT';
        process 'td:nth-child(6)',       'name', 'TEXT';
    };
};

for my $uri (@url) {
    # Without this check a page with no <tr> matches would die with an opaque
    # "Can't use an undefined value as an ARRAY reference".
    my $rows = $scraper->scrape($uri)->{characters}
        or die "no table rows found at $uri";
    my @chars = @{$rows};

    # remove headers (the first two rows of the page)
    splice @chars, 0, 2;

    if (++$i % 2) {
        # First page of a pair: keep its rows until the partner is scraped.
        @prev = @chars;
    }
    else {
        # Second page of a pair: rows are matched up purely by position, so
        # the two pages must have the same number of rows.
        @prev == @chars
            or die "ja/en count doesn't match ("
                 . scalar(@prev) . " vs " . scalar(@chars) . ") at $uri";
        for my $c (0 .. $#prev) {
            $prev[$c]->{name_en} = $chars[$c]->{name};
        }
        push @{$res}, @prev;
    }
}
# Write the collected records to standard output as YAML, with the :utf8
# layer applied to the handle so non-ASCII text is emitted as UTF-8.
binmode(STDOUT, ":utf8");
print STDOUT Dump($res);