#!/usr/bin/perl
# benz - detects superset/subset relations between csv files
# by hoge1e3
#
# usage:
#   $0 filespec [ filespec ... ]
#   filespec := fileName[:colNum[:colNum ...]]
#
# fileName can be *.csv (comma separated); any other name (e.g. *.txt) is
# read as tab separated.
# colNum is either a 0-based number or letters A-Z, corresponding to the
# column names on Excel worksheets (A = 0, B = 1, ...). A run of letters
# such as "AC" selects one key column per letter. Without any colNum the
# first column is used as the key.
#
# example:
#   ./benz.pl students.csv:B score.csv
# output:
#   students.csv_score.csv_.txt
#
# Every output line holds, tab separated: a bitmask telling which input
# files contain the key (first file = 1, second = 2, third = 4, ...),
# followed by the record found in each file, padded with empty fields to
# the widest row seen in that file.

use strict;
use warnings;

my %sets;       # setid (a power of two) -> set object
my %records;    # recid -> setid -> [@rec]

main(@ARGV) unless caller;

# Entry point: parses the filespecs, loads every file into %records and
# writes the merged report. With no arguments it only prints a usage line.
sub main {
    my @specs = @_;
    if (!@specs) {
        print STDERR "usage: $0 filespec [ filespec ... ]\n";
        return;
    }

    my $setid   = 1;
    my $outFile = "";
    for my $spec (@specs) {
        my ($f, @colSpecs) = split(/:/, $spec);
        die "Empty file name in filespec '$spec'\n"
            unless defined $f and length $f;
        my $set = bless {
            setid    => $setid,
            file     => $f,
            cols     => [ parseCols(@colSpecs) ],
            colCount => 0,
        }, __PACKAGE__;
        $sets{$setid} = $set;
        $setid *= 2;    # one bit per input file
        $outFile .= $f . "_";
    }
    $outFile .= ".txt";
    $outFile =~ s#/#__#g;    # keep the report in the current directory

    # Set ids are numbers; a plain (string) sort would put 16 before 2 and
    # scramble the column order once five or more files are given.
    my @order = sort { $a <=> $b } keys %sets;

    for my $id (@order) {
        $sets{$id}->read;
    }

    open(my $out, '>', $outFile) or die "Cannot open $outFile: $!";
    for my $recid (sort keys %records) {
        my $rec  = $records{$recid};
        my $bits = 0;
        my @buf;
        for my $id (@order) {
            my $set = $sets{$id};
            my $row = $rec->{$id};
            $bits |= $set->{setid} if $row;
            push @buf, join("\t", pad($set->{colCount}, $row ? @$row : ()));
        }
        print {$out} "$bits\t" . join("\t", @buf) . "\n";
    }
    close($out) or die "Cannot close $outFile: $!";
    return;
}

# Turns the colon-separated column specs of one filespec into a list of
# 0-based column indexes. No spec at all means column 0. Letter specs are
# expanded by toInt (one column per letter); anything else must be a
# non-negative integer.
sub parseCols {
    my @colSpecs = @_;
    return (0) unless @colSpecs;
    my @cols;
    for my $c (@colSpecs) {
        if ($c =~ /\A[A-Za-z]+\z/) {
            push @cols, toInt(uc $c);
        }
        elsif ($c =~ /\A[0-9]+\z/) {
            push @cols, $c + 0;
        }
        else {
            die "Bad column spec '$c'\n";
        }
    }
    return @cols;
}

# Method of a set object: reads $t->{file} line by line, tracks the widest
# row in $t->{colCount} and stores each row in %records under the key built
# from the set's key columns. A key seen twice in the same file is reported
# on STDERR and the later row wins.
sub read {
    my $t   = shift;
    my $csv = ($t->{file} =~ /csv$/) ? 1 : 0;
    open(my $in, '<', $t->{file}) or die("Not found $t->{file}: $!");
    while (my $line = <$in>) {
        $line =~ s/\n//;
        $line =~ s/\r//;    # tolerate CRLF line endings
        my @rec = $csv ? parseExcelLine($line) : split(/\t/, $line);
        $t->{colCount} = @rec if @rec > $t->{colCount};

        # A key column missing from a short row counts as an empty string.
        my @ids   = map { defined $rec[$_] ? $rec[$_] : "" } @{ $t->{cols} };
        my $recid = join("\t", @ids);
        my $rec   = ($records{$recid} ||= {});
        if ($rec->{ $t->{setid} }) {
            print STDERR "duplicate: $t->{file} - $recid\n";
        }
        $rec->{ $t->{setid} } = [@rec];
    }
    close $in;
    return;
}

# Returns @rec extended with empty strings until it has at least $num
# elements; longer lists are returned unchanged.
sub pad {
    my ($num, @rec) = @_;
    push @rec, ("") x ($num - @rec) if @rec < $num;
    return @rec;
}

# Splits one line of an Excel-style CSV file into its fields. Fields may be
# wrapped in double quotes, in which case they may contain commas and a
# doubled quote ("") stands for a literal quote. Returns the list of fields.
sub parseExcelLine {
    my $line = shift;
    $line =~ s/\n//;
    $line =~ s/\r//;
    my $rest = $line . ",";    # sentinel: every field is followed by a comma
    my @res  = ();
    while (length $rest) {
        if (   $rest =~ s/\A"((?:[^"]|"")*)"(?=,)//    # well-formed quoted field
            or $rest =~ s/\A"(.*?[^"])"(?=,)// )       # lenient: stray quote inside
        {
            my $it = $1;
            $it =~ s/""/"/g;
            push @res, $it;
        }
        elsif ($rest =~ s/\A"//) {
            # No closing quote: report it; the remainder is then parsed as
            # unquoted text on the next pass.
            print STDERR "excelerror : unterminated $rest\n";
        }
        else {
            # Unquoted field, possibly empty.
            $rest =~ s/\A([^,]*)//;
            push @res, $1;
        }
        $rest =~ s/\A,//;
    }
    return @res;
}

# Converts a string of upper-case column letters into 0-based indexes, one
# per letter: "B" -> (1), "AC" -> (0, 2).
sub toInt {
    my $cols = shift;
    return map { $_ - 65 } unpack("C*", $cols);    # 65 == ord('A')
}

1;