File:  [LON-CAPA] / loncom / publisher / Attic / testthesaurus.pl
Revision 1.1: download - view: text, annotated - select for diffs
Sat Oct 13 23:03:17 2001 UTC (22 years, 7 months ago) by www
Branches: MAIN
CVS tags: version_0_4, stable_2002_spring, stable_2002_july, stable_2002_april, stable_2001_fall, STABLE, HEAD
Helper programs to create thesaurus
Only thesaurus.dat should be installed.

use strict;


my @related=();
my @word=();
my @count=();
my %index=();
my $totalcount=0;
my $fuzzy=2;

# --------------------- Read thesaurus

open(IN,"thesaurus.dat");

while (my $entry=<IN>) {
    my ($tword,$tindex,$tcount,$trelated)=split(/\@/,$entry);
    $index{$tword}=$tindex;
    $word[$tindex]=$tword;
    $count[$tindex]=$tcount;
    $totalcount+=$tcount;
    $related[$tindex]=$trelated;
}

close(IN);

my $avecount=$totalcount/$#count;

print "Read thesaurus\n";
print "Average count: $avecount\n";

# --------------------- Test it
print "Input word [,fuzzyness]: ";
while (my $input=<STDIN>) {
    chomp($input);
    my ($newword,$newfuzzy)=split(/\,/,$input);
    if ($newfuzzy) { $fuzzy=$newfuzzy; }
    my $threshold=0.1/$fuzzy;
    $newword=~s/\W//g;
    $newword=~tr/A-Z/a-z/;
    print "\n\n\n--- $newword (Fuzzy: $fuzzy) ---\n";
    my $tindex=$index{$newword};
    if ($tindex) {
        if ($count[$tindex]>$avecount) {
           print "\nKEYWORD\n\n";
        } else {
           print "\nNot keyword\n\n";
        }
        my %found=();
        print "Related:\n";
        map {
# - Related word found
            my ($ridx,$rcount)=split(/\:/,$_);
# - Direct relation index
            my $directrel=$rcount/$count[$tindex];
            if ($directrel>$threshold) {
               map {
                  my ($rridx,$rrcount)=split(/\:/,$_);
                  if ($rridx==$tindex) {
# - Determine reverse relation index
                     my $revrel=$rrcount/$count[$ridx];
# - Calculate full index
                     $found{$ridx}=$directrel*$revrel;
                     if ($found{$ridx}>$threshold) {
                        map {
                            my ($rrridx,$rrrcount)=split(/\:/,$_);
                            unless ($found{$rrridx}) {
                               my $revrevrel=$rrrcount/$count[$ridx];
                               if (
                          $directrel*$revrel*$revrevrel>$threshold
                               ) {
                                  $found{$rrridx}=
                                       $directrel*$revrel*$revrevrel;
                               }
                            }
                        } split(/\,/,$related[$ridx]);
                     }
                  }
               } split(/\,/,$related[$ridx]);
            }
        } split(/\,/,$related[$tindex]);
# - Print results
        map {
           if ($found{$_}>$threshold) {
              print '  '.$word[$_].' '.$found{$_}."\n";
           }
        } sort { $found{$b}<=>$found{$a} } keys %found;
    } else {
        print "\nNot found\n\n";
    }
    print "\nInput word: ";
}


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>