File:  [LON-CAPA] / loncom / publisher / Attic / thesaurus.pl
Revision 1.1: download - view: text, annotated - select for diffs
Sat Oct 13 23:03:17 2001 UTC (22 years, 8 months ago) by www
Branches: MAIN
CVS tags: version_0_4, stable_2002_spring, stable_2002_july, stable_2002_april, stable_2001_fall, STABLE, HEAD
Helper programs to create thesaurus
Only thesaurus.dat should be installed.

use strict;

# ---------------------- Read word file

open(IN,"rawkey.txt");
print "Opened file\n";
my $readdata='';
my %wordcount=();
my %wordindex=();
my %related=();
my $line;
while ($line=<IN>) {
    $readdata.=$line;
}
close(IN);
undef $line;

# ---------------------- Get all words and index them

my $index=0;
map {
    map {
       my $word=$_;
       if ($word) {
          $word=~tr/A-Z/a-z/;
          $wordcount{$word}++;
          unless (defined $wordindex{$word}) {
              $index++;
              $wordindex{$word}=$index;
          }
       }
    } split(/\W+/,$_);
} split(/(\n|\r)+/,$readdata);

print "Built word index\n";

# ---------------------- Find related words

map {
   my $line=$_;
   if ($line) {
      my @words=split(/\W+/,$line);
      map {
         my $word=$_;
         if ($word) {
            $word=~tr/A-Z/a-z/;
            my $twordidx=$wordindex{$word};
            if ($twordidx) {
               my %alreadyrelated=();
               if (defined $related{$twordidx}) {
                  map {
                      my ($idx,$count)=split(/\:/,$_);
                      $alreadyrelated{$idx}=$count;
                  } split(/\,/,$related{$twordidx});
               }
               map {
                   my $rword=$_;
                   $rword=~tr/A-Z/a-z/;
                   if (($rword) && ($rword ne $word)) {
                      my $rwordidx=$wordindex{$rword};
                      if (defined $alreadyrelated{$rwordidx}) {
                         $alreadyrelated{$rwordidx}++;
                      } else {
                         $alreadyrelated{$rwordidx}=1;
                      }
                   }
               } @words;
               $related{$twordidx}='';
               map {
                   $related{$twordidx}.=$_.':'.$alreadyrelated{$_}.',';
               } keys %alreadyrelated;
               chop $related{$twordidx};            
            } else {
               print "Warning! Unknown word: ".$word;
            }
         }
      } @words;
   }
} split(/(\n|\r)+/,$readdata);

print "Built hash of related words\n";

# ---------------------- Output

open(OUT,">thesaurus.dat");
map {
    my $wordidx=$wordindex{$_};
    print OUT $_.'@'.$wordidx.'@'.$wordcount{$_}.'@'.$related{$wordidx}.
          "\n";
} sort keys %wordindex;
close(OUT);

print "Wrote thesaurus file\n";

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>