#!/usr/bin/perl
use strict;
use warnings;
use Path::Class;
use YAML;
use FindBin;

# Reads the text dump of the SoftBank HTML spec PDF, collects one hash entry
# per matching table row, and prints the resulting hash to STDOUT as YAML.
#
# how to make 103-111-HTML_2.0.0.txt
# 1. get PDF from http://www2.developers.softbankmobile.co.jp/dp/tool_dl/download.php?docid=120&companyid=
# 2. xdoc2txt -n 103-111-HTML_2.0.0.pdf > 103-111-HTML_2.0.0.txt
# ref. http://www31.ocn.ne.jp/~h_ishida/xdoc2txt.html

# The trailing newline keeps die() from appending "at ... line N." to what is
# a usage hint rather than an internal error.
my $pdf_text_file = shift
    or die "Usage: softbank-scrape-autosjis.pl 103-111-HTML_2.0.0.txt\n";

# NOTE(review): Path::Class's openr is expected to raise on failure, so no
# explicit check is made here -- confirm against the installed version.
my $pdf_fh = file($pdf_text_file)->openr;

# Maps the stripped second column of a row to its fourth column, verbatim.
my %map;

LINE:
while (my $line = <$pdf_fh>) {
    chomp $line;

    # Only rows that start with a five-digit decimal character reference
    # followed by a hexadecimal one are table rows of interest.
    next LINE if $line !~ /^&#\d\d\d\d\d;\s*&#x/;

    # A usable row has exactly four whitespace-separated columns; anything
    # else is skipped rather than guessed at.
    my @codes = split /\s+/, $line;
    next LINE if @codes != 4;

    my $unicode  = strip_entity_ref_mark($codes[1]);
    my $shiftjis = $codes[3];

    # A later row with the same key silently replaces an earlier one.
    $map{$unicode} = $shiftjis;
}

close $pdf_fh;

print Dump(\%map);

# strip_entity_ref_mark($entity)
#
# Removes a leading "&#x" and a trailing ";" from $entity and returns the
# result (e.g. "&#xE001;" becomes "E001"). Either mark may be absent; the
# rest of the string is returned unchanged.
sub strip_entity_ref_mark {
    my ($entity) = @_;

    $entity =~ s/(^&#x|;$)//g;

    return $entity;
}