File:  [LON-CAPA] / loncom / thesaurus / build_thesaurus_db.pl
Revision 1.1: download - view: text, annotated - select for diffs
Thu Jul 11 20:48:31 2002 UTC (21 years, 10 months ago) by matthew
Branches: MAIN
CVS tags: version_1_0_3, version_1_0_2, version_1_0_1, version_1_0_0, version_0_99_5, version_0_99_4, version_0_99_3, version_0_99_2, version_0_99_1, version_0_99_0, version_0_6_2, version_0_6, version_0_5_1, version_0_5, conference_2003, HEAD
Script to build LON-CAPA thesaurus database.

    1: #!/usr/bin/perl -w
    2: #
    3: # $Id: build_thesaurus_db.pl,v 1.1 2002/07/11 20:48:31 matthew Exp $
    4: #
    5: #
    6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
    7: #
    8: # Copyright Michigan State University Board of Trustees
    9: #
   10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
   11: #
   12: # LON-CAPA is free software; you can redistribute it and/or modify
   13: # it under the terms of the GNU General Public License as published by
   14: # the Free Software Foundation; either version 2 of the License, or
   15: # (at your option) any later version.
   16: #
   17: # LON-CAPA is distributed in the hope that it will be useful,
   18: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   20: # GNU General Public License for more details.
   21: #
   22: # You should have received a copy of the GNU General Public License
   23: # along with LON-CAPA; if not, write to the Free Software
   24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   25: #
   26: # /home/httpd/html/adm/gpl.txt
   27: #
   28: # http://www.lon-capa.org/
   29: #
   30: use strict;
   31: use Getopt::Long;
   32: use GDBM_File;
   33: # POD required stuff:
   34: 
   35: =pod
   36: 
   37: =head1 NAME
   38: 
   39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
   40: 
   41: =head1 SYNOPSIS
   42: 
   43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
   44: 
   45: =head1 DESCRIPTION
   46: 
   47: build_thesaurus_db.pl reads two input files.  The first is a list of words to
   48: omit from the thesaurus.  The second is the raw keyword data for the thesaurus.
   49: From this file a database is built.
   50: 
   51: =head1 DATABASE FORMAT DESCRIPTION
   52: 
   53: The structure of the database entries is described below.  
   54: 
   55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
   56: 
   57: Allow me to repeat myself:
   58: 
   59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
   60: 
   61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
   62: 
   63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
   64: 
   65: Got it?  While you are reading this, let me encourage you to document
   66: any changes to the structure of the database.  It is not that hard and
   67: you will save much time if you do.  
   68: 
   69: That said, you should make sure the description below actually matches
   70: the code, just to be safe.
   71: 
   72: This concludes the lecture portion of the comments.
   73: 
   74: =head1 DATABASE FORMAT DESCRIPTION
   75: 
   76: An entry in the database for a given word is shown below:
   77: 
   78:  polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
   79:               |   |  |
   80:               |   |  The number of times dna appeared in a keywords list
   81:               |   |  with the word polymerase. 
   82:               |   The related keyword
   83:               The number of times polymerase appeared in a keywords list.
   84: 
   85: Note: the related words list will be in descending order of occurance with 
   86: the keyword.
   87: 
   88: =head1 COMMAND LINE OPTIONS
   89: 
   90: =over 4
   91: 
   92: 
   93: =item --badwordfile <filename>
   94: 
   95: filename must contain a list of words not to put in the thesaurus.  
   96: Each word must appear on its own line.
   97: Currently comments are not supported.
   98: 
   99: =item --keywordfile <filename>
  100: 
  101: File containing the raw word data for the thesaurus.  Each line must be 
  102: comma seperated list of related keywords.
  103: 
  104: =item --outputdb <filename>
  105: 
  106: file to write the LON-CAPA thesaurus database to.
  107: 
  108: =item --help
  109: 
  110: Display this help message and exit.
  111: 
  112: =item --test
  113: 
  114: Run a few test lookups after writing the database.
  115: 
  116: =back
  117: 
  118: The following example shows the default values for each parameter
  119: 
  120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
  121: 
  122: =cut
  123: 
  124: ##
  125: ## Get command line parameters
  126: ##
  127: my ($badwordfile,$outputdbfile,$keywordfile,$help,$test);
  128: GetOptions( "badwordfile=s" => \$badwordfile,   # --badwordfile
  129:             "outputdb=s"    => \$outputdbfile,  # --outputdb
  130:             "keywordfile=s" => \$keywordfile,   # --keywordfile
  131:             "help"          => \$help,          # --help
  132:             "test"          => \$test);         # --test
  133: 
  134: ##
  135: ## Help! Help!
  136: ##
  137: if ($help) {
  138:     print <<ENDHELP;
  139: build_thesaurus_db.pl     Build a LON-CAPA thesaurus database.
  140: 
  141: Command line arguements
  142:    --badwordfile <filename>     filename must contain a list of words not to
  143:                                 put in the thesaurus.  Each word must appear
  144:                                 on its own line and currently comments are not
  145:                                 supported.
  146:    --keywordfile <filename>     File containing the raw word data for the
  147:                                 thesaurus.  Each line must be comma seperated
  148:                                 list of related keywords.
  149:    --outputdb <filename>        file to write the LON-CAPA thesaurus database
  150:                                 to.
  151:    --help                       Display this help message and exit.
  152:    --test                       Run a few test lookups after writing the 
  153:                                 database.
  154: The following example shows the default values for each parameter
  155: 
  156: build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
  157:      --outputdb ./thesaurus.db --keywordfile rawkey.txt
  158: 
  159: ENDHELP
  160:     exit;
  161: }
  162: 
  163: ##
  164: ## Set up defaults for parameters and check validity
  165: ##
  166: $badwordfile  = $badwordfile  || "./un_keyword.tab";
  167: $outputdbfile = $outputdbfile || "./thesaurus.db";
  168: $keywordfile  = $keywordfile  || "./rawkey.txt";
  169: 
  170: foreach my $file ($badwordfile,$keywordfile) {
  171:     die "$file does not exist." if (! -e $file);
  172: }
  173: 
  174: ##
  175: ## Global hashes.
  176: ##
  177: my %wordcount = ();    # Holds the number of times each word appears in the
  178:                        # input file.
  179: my %related_words=();  # Holds the words related to a word.  The keys of this
  180:                        # has are words, and the values are pointers to hashes
  181:                        # which hold the words and their frequencies.
  182: my %isbad;             # Holds an entry for each keyword that is 'bad'
  183: 
  184: ##
  185: ## Initialize hash of bad words.  'bad' meaning their appearance in a keyword
  186: ## list does not add information.  Not 'bad' meaning profane.  
  187: ##
  188: open BAD,$badwordfile || die "Unable to open ".$badwordfile;
  189: while (<BAD>) {
  190:     chomp;
  191:     $isbad{lc($_)}++;
  192: }
  193: close BAD;
  194: 
  195: ##
  196: ## Read in the data file and construction related words hash.  Skip bad words.
  197: ##
  198: open(IN,$keywordfile) || die "Unable to open ".$keywordfile;
  199: while (<IN>) {
  200:     chomp;
  201:     my @Words = split(/\W+/,lc($_));
  202:     foreach my $keyword (@Words) {
  203:         next if ($isbad{$keyword});
  204:         $wordcount{$keyword}++;
  205:         foreach my $otherword (@Words) {
  206:             next if (($otherword eq $keyword) || ($isbad{$otherword}));
  207:             $related_words{$keyword}->{$otherword}++;
  208:         }
  209:     }
  210: }
  211: close(IN);
  212: 
  213: ##
  214: ## Determine average number of entries
  215: ##
  216: my $totalcount;
  217: foreach (keys(%wordcount)) {
  218:     $totalcount+=$wordcount{$_};
  219: }
  220: my $avecount = $totalcount /(scalar keys(%wordcount));
  221: 
  222: ##
  223: ## Make sure we can write the database.
  224: ##
  225: if (-e $outputdbfile) {
  226:     die "Cannot remove ".$outputdbfile if (!unlink $outputdbfile);
  227: }
  228: my %thesaurus_db;
  229: if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)) {
  230:     die "Error opening DB file.\n";
  231: }
  232: 
  233: ##
  234: ## Write the database file
  235: ##
  236: foreach my $word (keys(%related_words)) {
  237:     next if (! defined($word));
  238:     my $result = &get_related($word);
  239:     $thesaurus_db{$word}=$wordcount{$word}.":".$result if ($result);
  240: }
  241: 
  242: ##
  243: ## Store away special values (must contain characters not matched by \w)
  244: ##
  245: $thesaurus_db{'average.count'}=$avecount;
  246: $thesaurus_db{'total.count'}=$totalcount;
  247: untie %thesaurus_db;
  248: 
  249: ##
  250: ## Perform test lookups
  251: ##
  252: if ($test) {
  253:     if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)) {
  254:         die "Error opening DB file.\n";
  255:     }
  256:     foreach my $word ('torque','rna','polymerase') {
  257:         my $result = $thesaurus_db{$word};
  258:         print "Results for $word = $result\n" if ($result);
  259:     }
  260:     untie %thesaurus_db;
  261: }
  262: 
  263: 
  264: ################################################################
  265: ################################################################
  266: #
  267: # get_related($keyword) is a utility function which will return a string
  268: #     of the format: 
  269: #        keyword1,frequency1:keyword2,frequency2:.....
  270: #
  271: #     'frequency1' is the number of times the keyword1 appears in a keywords
  272: #     list with $keyword.
  273: #
  274: sub get_related {
  275:     my $keyword = shift;
  276:     return undef if ((! $keyword) ||(! exists($related_words{$keyword})));
  277:     my %related_hash = %{$related_words{$keyword}};
  278:     my @Related_words = keys(%{$related_words{$keyword}});
  279:     @Related_words = sort {$related_hash{$b} <=> $related_hash{$a} } 
  280:                           @Related_words;
  281:     my $result;
  282:     foreach (@Related_words) {
  283:         $result .= "$_,$related_hash{$_}:";
  284:     }
  285:     chop $result;
  286:     return $result;
  287: }
  288: 
  289: 
  290: 
  291: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>