#!/usr/bin/perl # xpathsh.pl : XPath をテストする scraper みたいなもの # # Version: 0.0.1 # Author: MIYAMUKO Katusyuki # License: MIT/X # Usage: # # > perl xpathsh.pl http://coderepos.org/share/log/ # Getting http://coderepos.org/share/log/ # Parsing http://coderepos.org/share/log/ # xpath> ? # q quit # d dump HTML elements # s dump HTML sources # h xpath history # ? this help message # xpath> id("chglist")//tr[1]/td[@class="rev"] # @0.1.2.1.3.1.1.0.2 # @0.1.2.1.3 # .1.1.0.2.0 # "@9445" # " " # --> 9445 # xpath> h # id("chglist")//tr[1]/td[@class="rev"] ... 1 hits # xpath> q # use strict; use warnings; use HTML::TreeBuilder::XPath; use LWP::Simple; my @history; sub help { print " q quit\n"; print " d dump HTML elements\n"; print " s dump HTML sources\n"; print " h xpath history\n"; print " ? this help message\n"; } sub slurp { my $uri = shift; if ($uri =~ m!^https?://!) { return get($uri); } elsif (-e $uri) { open my $fh, "<", $uri or die "$uri: $!"; return join "", <$fh>; } else { die "Unsupported resource `$uri'"; } } sub to_text { my $elem = shift; if (!ref($elem)) { $elem; } elsif ($elem->isa("HTML::TreeBuilder::XPath::TextNode")) { $elem->getValue; } else { join("", map { to_text($_) } @{$elem->content}); } } sub extract_number { my $r = join("", map { to_text($_) } @_); $r =~ s/\D//g; if ($r =~ /\d/) { return $r; } } sub prompt { print "xpath> "; } sub save_history { my $xpath = shift; my $hits = shift; push @history, [$xpath, $hits] unless grep { $_ eq $xpath } @history; } sub print_nodes { my $tree = shift; my $xpath = shift; my @nodes = $tree->findnodes($xpath); if (@nodes) { map { if ($_->isa("HTML::TreeBuilder::XPath::TextNode")) { print $_->getValue; } else { print $_->dump; } } @nodes; print "\n"; print "--> ", (extract_number(@nodes) || "no number"), "\n"; save_history($xpath, scalar @nodes); } return scalar @nodes; } sub print_value { my $tree = shift; my $xpath = shift; my @val = $tree->findvalue($xpath); if (@val) { print $val[0], "\n"; save_history($xpath, scalar @val); } return scalar @val; } sub main { my $uri_or_filename = shift or die "Usage: xpathsh URI-or-filename\n"; print "Getting $uri_or_filename\n"; my $content = slurp($uri_or_filename); print "Parsing $uri_or_filename\n"; my $tree = HTML::TreeBuilder::XPath->new; $tree->parse($content); while (prompt, defined(my $in = )) { chomp($in); if ($in eq "?") { help; } elsif ($in eq "q") { exit 0; } elsif ($in eq "s") { print $content, "\n"; } elsif ($in eq "d") { print $tree->dump; } elsif ($in eq "h") { foreach my $h (@history) { printf("%-65s ... %d hits\n", $h->[0], $h->[1]); } } elsif ($in) { eval { print_nodes($tree, $in) || print_value($tree, $in); }; if ($@) { print $@, "\n"; next; } } } } main(@ARGV);