use strict; # ---------------------- Read word file open(IN,"rawkey.txt"); print "Opened file\n"; my $readdata=''; my %wordcount=(); my %wordindex=(); my %related=(); my $line; while ($line=) { $readdata.=$line; } close(IN); undef $line; # ---------------------- Get all words and index them my $index=0; map { map { my $word=$_; if ($word) { $word=~tr/A-Z/a-z/; $wordcount{$word}++; unless (defined $wordindex{$word}) { $index++; $wordindex{$word}=$index; } } } split(/\W+/,$_); } split(/(\n|\r)+/,$readdata); print "Built word index\n"; # ---------------------- Find related words map { my $line=$_; if ($line) { my @words=split(/\W+/,$line); map { my $word=$_; if ($word) { $word=~tr/A-Z/a-z/; my $twordidx=$wordindex{$word}; if ($twordidx) { my %alreadyrelated=(); if (defined $related{$twordidx}) { map { my ($idx,$count)=split(/\:/,$_); $alreadyrelated{$idx}=$count; } split(/\,/,$related{$twordidx}); } map { my $rword=$_; $rword=~tr/A-Z/a-z/; if (($rword) && ($rword ne $word)) { my $rwordidx=$wordindex{$rword}; if (defined $alreadyrelated{$rwordidx}) { $alreadyrelated{$rwordidx}++; } else { $alreadyrelated{$rwordidx}=1; } } } @words; $related{$twordidx}=''; map { $related{$twordidx}.=$_.':'.$alreadyrelated{$_}.','; } keys %alreadyrelated; chop $related{$twordidx}; } else { print "Warning! Unknown word: ".$word; } } } @words; } } split(/(\n|\r)+/,$readdata); print "Built hash of related words\n"; # ---------------------- Output open(OUT,">thesaurus.dat"); map { my $wordidx=$wordindex{$_}; print OUT $_.'@'.$wordidx.'@'.$wordcount{$_}.'@'.$related{$wordidx}. "\n"; } sort keys %wordindex; close(OUT); print "Wrote thesaurus file\n";