#!/usr/bin/perl
# Build a related-word ranking using DF (document frequency) values.
#
#  -d FILE_NAME : df file (generated by ext-wpj-words.pl)
#  -p FILE_NAME : [optional] person name file (generated by ext-wpj-person.pl)
#                 When given, only keywords registered in this file are output.
#  stdin        : keyword tsv file (generated by ext-wpj-words.pl)
#
# Usage:
#  mkrel-wpj.pl -d wj-word.df wj-word.txt > wj-rel.tsv
#  mkrel-wpj.pl -d wj-word.df -p wj-person.txt wj-word.txt > wj-person-rel.tsv

use strict;
use warnings;
use Encode;
use Getopt::Long;
use utf8;
use open ':utf8';
binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";

my $df_fn = "tmp.df";
my $ps_fn = "";
GetOptions(
    # '-d' is what the usage text documents; '--df' is kept as an alias so
    # existing invocations keep working.
    'd|df=s' => \$df_fn,
    'p=s'    => \$ps_fn,
);

### reading df file
# Each useful line has the form "<word>\t<count>"; other lines are ignored.
my %df;
open(my $df_fh, "<", $df_fn) or die "can't open '$df_fn': $!";
while (my $line = <$df_fh>) {
    $df{$1} = $2 if $line =~ /^(.+?)\t(\d+)$/;
}
close $df_fh;
#print join("\n", map {"$_\t$df{$_}"} sort keys %df), "\n";

### reading person name file (optional)
# One name per line. Only names that have a non-zero DF entry are kept.
my %person;
my $person_only = ($ps_fn ne "");
if ($person_only) {
    # A person file that was requested but cannot be read is fatal: continuing
    # with an empty %person would silently filter out every input line.
    open(my $ps_fh, "<", $ps_fn) or die "can't open '$ps_fn': $!";
    while (my $name = <$ps_fh>) {
        chomp $name;
        $person{$name} = $df{$name} if $df{$name};
    }
    close $ps_fh;
}
#print join("\n", map {"$_\t$person{$_}"} sort keys %person), "\n";

### processing keyword tsv
# Input line:  keyword \t word1 \t word2 ...
# Output line: keyword \t wordA:df \t wordB:df ...   (ascending DF)
$| = 1;    # unbuffered STDOUT
while (my $line = <>) {
    chomp $line;
    my ($keyword, @candidates) = split(/\t/, $line);
    next unless defined $keyword;    # blank line
    next if $person_only and not exists $person{$keyword};

    my %relwords;
    for my $word (@candidates) {
        my $d = $df{$word};
        next unless $d;              # unknown word or DF of 0
        next if $person_only and not exists $person{$word};
        $relwords{$word} = $d;
    }
    next unless %relwords;

    # Lowest DF first; ties are broken by the word itself so that the output
    # does not depend on hash ordering.
    my @ranked = sort { $relwords{$a} <=> $relwords{$b} or $a cmp $b }
                 keys %relwords;
    print "$keyword\t", join("\t", map {"$_:$relwords{$_}"} @ranked), "\n";
}