use strict; my @related=(); my @word=(); my @count=(); my %index=(); my $totalcount=0; my $fuzzy=2; # --------------------- Read thesaurus open(IN,"thesaurus.dat"); while (my $entry=) { my ($tword,$tindex,$tcount,$trelated)=split(/\@/,$entry); $index{$tword}=$tindex; $word[$tindex]=$tword; $count[$tindex]=$tcount; $totalcount+=$tcount; $related[$tindex]=$trelated; } close(IN); my $avecount=$totalcount/$#count; print "Read thesaurus\n"; print "Average count: $avecount\n"; # --------------------- Test it print "Input word [,fuzzyness]: "; while (my $input=) { chomp($input); my ($newword,$newfuzzy)=split(/\,/,$input); if ($newfuzzy) { $fuzzy=$newfuzzy; } my $threshold=0.1/$fuzzy; $newword=~s/\W//g; $newword=~tr/A-Z/a-z/; print "\n\n\n--- $newword (Fuzzy: $fuzzy) ---\n"; my $tindex=$index{$newword}; if ($tindex) { if ($count[$tindex]>$avecount) { print "\nKEYWORD\n\n"; } else { print "\nNot keyword\n\n"; } my %found=(); print "Related:\n"; map { # - Related word found my ($ridx,$rcount)=split(/\:/,$_); # - Direct relation index my $directrel=$rcount/$count[$tindex]; if ($directrel>$threshold) { map { my ($rridx,$rrcount)=split(/\:/,$_); if ($rridx==$tindex) { # - Determine reverse relation index my $revrel=$rrcount/$count[$ridx]; # - Calculate full index $found{$ridx}=$directrel*$revrel; if ($found{$ridx}>$threshold) { map { my ($rrridx,$rrrcount)=split(/\:/,$_); unless ($found{$rrridx}) { my $revrevrel=$rrrcount/$count[$ridx]; if ( $directrel*$revrel*$revrevrel>$threshold ) { $found{$rrridx}= $directrel*$revrel*$revrevrel; } } } split(/\,/,$related[$ridx]); } } } split(/\,/,$related[$ridx]); } } split(/\,/,$related[$tindex]); # - Print results map { if ($found{$_}>$threshold) { print ' '.$word[$_].' '.$found{$_}."\n"; } } sort { $found{$b}<=>$found{$a} } keys %found; } else { print "\nNot found\n\n"; } print "\nInput word: "; }