#!/usr/bin/perl
# WebScraperHelper : Helper GUI for Web::Scraper
#
# Description: a tool for trying out XPath (or CSS selector) expressions
#              against a fetched web page in a GUI.
# Version: 0.0.1
# Author: Yasuhiro Matsumoto
# License: GPLv2
#
# Usage: WebScraperHelper [--ua=USER_AGENT] [URL]

use strict;
use warnings;

use Gtk2 qw/-init/;
use LWP::UserAgent;
use HTML::TreeBuilder::XPath;
use HTML::Selector::XPath;
use URI;
use Encode;
use HTTP::Response::Encoding;
use List::Util qw(first);
use Getopt::Long;

# GDK key value of the lower-case "r" key (used for the Ctrl+R shortcut).
use constant KEYVAL_LOWER_R => 114;

# Parsed document of the most recently loaded page (undef until a page has
# been fetched and parsed successfully).
my $tree;

# User-Agent header sent with requests; can be overridden with --ua.
my $user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';

my $vbox = Gtk2::VBox->new(0, 10);

#---------------------------------------------
# URL
my $url_hbox   = Gtk2::HBox->new;
my $url_label  = Gtk2::Label->new('URL:');
my $url_text   = Gtk2::Entry->new;
my $url_update = Gtk2::Button->new('_Get');
$url_label->set_size_request (50, -1);
$url_label->set_alignment (1, 0.5);
$url_hbox->pack_start ($url_label, 0, 0, 0);
$url_hbox->add ($url_text);
$url_hbox->add ($url_update);
$vbox->pack_start ($url_hbox, 0, 0, 0);

#---------------------------------------------
# XPath
my $xpath_hbox   = Gtk2::HBox->new;
my $xpath_label  = Gtk2::Label->new('XPath:');
my $xpath_text   = Gtk2::Entry->new;
my $xpath_update = Gtk2::Button->new('_Update');
$xpath_label->set_size_request (50, -1);
$xpath_label->set_alignment (1, 0.5);
$xpath_hbox->pack_start ($xpath_label, 0, 0, 0);
$xpath_hbox->add ($xpath_text);
$xpath_hbox->add ($xpath_update);
$vbox->pack_start ($xpath_hbox, 0, 0, 0);

#---------------------------------------------
# Source View
my $result_view = Gtk2::TextView->new;
$result_view->set_editable (0);
$result_view->set_wrap_mode ('word-char');
my $scrolled_window = Gtk2::ScrolledWindow->new;
$scrolled_window->set_policy ('automatic', 'automatic');
# A TextView scrolls natively, so it is added directly instead of being
# wrapped in a viewport.
$scrolled_window->add ($result_view);
$vbox->pack_start ($scrolled_window, 1, 1, 0);

# The XPath controls stay disabled until a page has been loaded.
$xpath_text->set_sensitive (0);
$xpath_update->set_sensitive (0);

#---------------------------------------------
# Top Level Window
my $window = Gtk2::Window->new('toplevel');
$window->set_title ('WebScraperHelper');
$window->signal_connect ( 'destroy' => sub { Gtk2->main_quit } );
$window->add ($vbox);
$window->set_border_width (5);
$window->set_size_request (500, 500);

#---------------------------------------------
# Event Handler

# get_html: fetch the URL typed into the URL entry, decode it and parse it
# into $tree.
#
# On success the decoded source is shown in the result view and the XPath
# controls are enabled.  On failure (HTTP error, decoding error or parse
# error) the reason is shown in the result view, the XPath controls are
# disabled and any previously parsed $tree is left untouched.
#
# Always returns 0.
sub get_html {
    my $url = $url_text->get_text ();
    my $ua  = LWP::UserAgent->new(agent => $user_agent);
    my $res = $ua->get( URI->new($url) );

    my ($html, $error);
    if ($res->is_success) {
        my $ok = eval {
            my $content_type = $res->header('Content-Type');
            $content_type = '' unless defined $content_type;

            # Candidate encodings in order of preference; 'latin-1' is the
            # fallback that is always known to Encode.
            my @candidates = (
                $res->encoding,
                ($content_type =~ /charset=([\w\-]+)/g),
                'latin-1',
            );
            my $encoding =
                first { defined $_ && Encode::find_encoding($_) } @candidates;
            $html = Encode::decode($encoding, $res->content);

            # Build the new tree first and swap it in only once parsing
            # succeeded, so a failure never leaves $tree half-built.
            my $new_tree = HTML::TreeBuilder::XPath->new;
            $new_tree->parse($html);
            $new_tree->eof;
            $tree->delete if $tree;
            $tree = $new_tree;
            1;
        };
        $error = $@ || 'unknown error' unless $ok;
    }
    else {
        $error = $res->status_line;
    }

    my $loaded = (!defined $error && defined $html && length $html) ? 1 : 0;
    $xpath_text->set_sensitive ($loaded);
    $xpath_update->set_sensitive ($loaded);

    my $text = defined $error ? "Failed to load $url: $error"
             : defined $html  ? $html
             :                  '';
    $result_view->get_buffer ()->set_text ($text);
    return 0;
}

# select_node: evaluate the expression in the XPath entry against $tree and
# show the matching nodes, serialized as XML, in the result view.
#
# An expression starting with "/" is used as XPath as-is; anything else is
# treated as a CSS selector and converted with HTML::Selector::XPath.  An
# empty expression shows the whole document.  An expression that cannot be
# converted or evaluated produces an empty result and a warning on STDERR.
#
# Does nothing when no document has been loaded yet.  Always returns 0.
sub select_node {
    return 0 unless $tree;

    my $exp = $xpath_text->get_text();
    my $html_cut;
    if ($exp) {
        my @nodes = eval {
            my $xpath = $exp =~ m!^/!
                ? $exp
                : HTML::Selector::XPath::selector_to_xpath($exp);
            # The XPath engine may emit warnings for odd expressions; keep
            # them off the terminal while evaluating.
            local $SIG{__WARN__} = sub { };
            $tree->findnodes($xpath);
        };
        warn "WebScraperHelper: cannot evaluate '$exp': $@" if $@;

        $html_cut = '';
        for my $node (@nodes) {
            if (ref($node) eq 'HTML::TreeBuilder::XPath::Attribute') {
                $html_cut .= $node->toString;
            }
            elsif ($node->isTextNode) {
                # HTML::Entities is called by its full name; it is not
                # imported by this file.
                $html_cut .= HTML::Entities::encode($node->as_XML, q("'<>&));
            }
            else {
                $html_cut .= $node->as_XML;
            }
        }
    }
    else {
        $html_cut = $tree->as_XML;
    }

    $result_view->get_buffer ()->set_text ($html_cut);
    $xpath_text->grab_focus;
    return 0;
}

# xpath_key_pressed: 'key-press-event' handler for the XPath entry.
# Ctrl+R re-evaluates the current expression.
#
# Returns 0 so the event continues to be processed by the default handlers.
sub xpath_key_pressed {
    my ($widget, $event) = @_;
    # ">=" on Glib flags means "contains": true only when the Control
    # modifier is actually held, regardless of other active modifiers.
    if ($event->state >= 'control-mask' and $event->keyval == KEYVAL_LOWER_R) {
        select_node();
    }
    return 0;
}

$url_update->signal_connect ( 'clicked' => \&get_html );
$xpath_update->signal_connect ( 'clicked' => \&select_node );
$xpath_text->signal_connect ( 'key-press-event' => \&xpath_key_pressed );

#---------------------------------------------
$window->show_all;

# GTK's own options were already removed from @ARGV by "use Gtk2 -init".
# A bad option is reported but does not stop the GUI from starting.
GetOptions( 'ua=s' => \$user_agent )
    or warn "Usage: $0 [--ua=USER_AGENT] [URL]\n";

if (@ARGV) {
    $url_text->set_text($ARGV[0]);
    get_html();
}

Gtk2->main;