File:  [LON-CAPA] / loncom / thesaurus / build_thesaurus_db.pl
Revision 1.2: download - view: text, annotated - select for diffs
Tue Aug 5 15:51:37 2003 UTC (20 years, 8 months ago) by matthew
Branches: MAIN
CVS tags: version_2_9_X, version_2_9_99_0, version_2_9_1, version_2_9_0, version_2_8_X, version_2_8_99_1, version_2_8_99_0, version_2_8_2, version_2_8_1, version_2_8_0, version_2_7_X, version_2_7_99_1, version_2_7_99_0, version_2_7_1, version_2_7_0, version_2_6_X, version_2_6_99_1, version_2_6_99_0, version_2_6_3, version_2_6_2, version_2_6_1, version_2_6_0, version_2_5_X, version_2_5_99_1, version_2_5_99_0, version_2_5_2, version_2_5_1, version_2_5_0, version_2_4_X, version_2_4_99_0, version_2_4_2, version_2_4_1, version_2_4_0, version_2_3_X, version_2_3_99_0, version_2_3_2, version_2_3_1, version_2_3_0, version_2_2_X, version_2_2_99_1, version_2_2_99_0, version_2_2_2, version_2_2_1, version_2_2_0, version_2_1_X, version_2_1_99_3, version_2_1_99_2, version_2_1_99_1, version_2_1_99_0, version_2_1_3, version_2_1_2, version_2_1_1, version_2_1_0, version_2_12_X, version_2_11_X, version_2_11_4_uiuc, version_2_11_4_msu, version_2_11_4, version_2_11_3_uiuc, version_2_11_3_msu, version_2_11_3, version_2_11_2_uiuc, version_2_11_2_msu, version_2_11_2_educog, version_2_11_2, version_2_11_1, version_2_11_0_RC3, version_2_11_0_RC2, version_2_11_0_RC1, version_2_11_0, version_2_10_X, version_2_10_1, version_2_10_0_RC2, version_2_10_0_RC1, version_2_10_0, version_2_0_X, version_2_0_99_1, version_2_0_2, version_2_0_1, version_2_0_0, version_1_99_3, version_1_99_2, version_1_99_1_tmcc, version_1_99_1, version_1_99_0_tmcc, version_1_99_0, version_1_3_X, version_1_3_3, version_1_3_2, version_1_3_1, version_1_3_0, version_1_2_X, version_1_2_99_1, version_1_2_99_0, version_1_2_1, version_1_2_0, version_1_1_X, version_1_1_99_5, version_1_1_99_4, version_1_1_99_3, version_1_1_99_2, version_1_1_99_1, version_1_1_99_0, version_1_1_3, version_1_1_2, version_1_1_1, version_1_1_0, version_1_0_99_3, version_1_0_99_2, version_1_0_99_1, version_1_0_99, loncapaMITrelate_1, language_hyphenation_merge, language_hyphenation, bz6209-base, bz6209, bz5969, bz2851, PRINT_INCOMPLETE_base, 
PRINT_INCOMPLETE, HEAD, GCI_3, GCI_2, GCI_1, BZ5971-printing-apage, BZ5434-fox, BZ4492-merge, BZ4492-feature_horizontal_radioresponse
Bug 1492: build_thesaurus_db.pl has new command line switch --checkdates
to only build the thesaurus if the dependencies are newer than the target.

    1: #!/usr/bin/perl -w
    2: #
    3: # $Id: build_thesaurus_db.pl,v 1.2 2003/08/05 15:51:37 matthew Exp $
    4: #
    5: #
    6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
    7: #
    8: # Copyright Michigan State University Board of Trustees
    9: #
   10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
   11: #
   12: # LON-CAPA is free software; you can redistribute it and/or modify
   13: # it under the terms of the GNU General Public License as published by
   14: # the Free Software Foundation; either version 2 of the License, or
   15: # (at your option) any later version.
   16: #
   17: # LON-CAPA is distributed in the hope that it will be useful,
   18: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   20: # GNU General Public License for more details.
   21: #
   22: # You should have received a copy of the GNU General Public License
   23: # along with LON-CAPA; if not, write to the Free Software
   24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   25: #
   26: # /home/httpd/html/adm/gpl.txt
   27: #
   28: # http://www.lon-capa.org/
   29: #
   30: use strict;
   31: use Getopt::Long;
   32: use GDBM_File;
   33: # POD required stuff:
   34: 
   35: =pod
   36: 
   37: =head1 NAME
   38: 
   39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
   40: 
   41: =head1 SYNOPSIS
   42: 
   43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
   44: 
   45: =head1 DESCRIPTION
   46: 
   47: build_thesaurus_db.pl reads two input files.  The first is a list of words to
   48: omit from the thesaurus.  The second is the raw keyword data for the thesaurus.
   49: From this file a database is built.
   50: 
   51: =head1 DATABASE FORMAT DESCRIPTION
   52: 
   53: The structure of the database entries is described below.  
   54: 
   55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
   56: 
   57: Allow me to repeat myself:
   58: 
   59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
   60: 
   61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
   62: 
   63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
   64: 
   65: Got it?  While you are reading this, let me encourage you to document
   66: any changes to the structure of the database.  It is not that hard and
   67: you will save much time if you do.  
   68: 
   69: That said, you should make sure the description below actually matches
   70: the code, just to be safe.
   71: 
   72: This concludes the lecture portion of the comments.
   73: 
   74: =head1 DATABASE FORMAT DESCRIPTION
   75: 
   76: An entry in the database for a given word is shown below:
   77: 
   78:  polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
   79:               |   |  |
   80:               |   |  The number of times dna appeared in a keywords list
   81:               |   |  with the word polymerase. 
   82:               |   The related keyword
   83:               The number of times polymerase appeared in a keywords list.
   84: 
   85: Note: the related words list will be in descending order of occurrence with 
   86: the keyword.
   87: 
   88: =head1 COMMAND LINE OPTIONS
   89: 
   90: =over 4
   91: 
   92: 
   93: =item --badwordfile <filename>
   94: 
   95: filename must contain a list of words not to put in the thesaurus.  
   96: Each word must appear on its own line.
   97: Currently comments are not supported.
   98: 
   99: =item --keywordfile <filename>
  100: 
  101: File containing the raw word data for the thesaurus.  Each line must be a
  102: comma-separated list of related keywords.
  103: 
  104: =item --outputdb <filename>
  105: 
  106: file to write the LON-CAPA thesaurus database to.
  107: 
  108: =item --help
  109: 
  110: Display this help message and exit.
  111: 
  112: =item --test
  113: 
  114: Run a few test lookups after writing the database.
  115: 
  116: =back
  117: 
  118: The following example shows the default values for each parameter
  119: 
  120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
  121: 
  122: =cut
  123: 
  ##
  ## Get command line parameters
  ##
  ## Each switch is stored in the lexical variable of the same name.  A switch
  ## that is not given leaves its variable undefined; defaults for the file
  ## names are filled in later.
  ##
  my ($badwordfile,$outputdbfile,$keywordfile,$help,$checkdates,$test);
  GetOptions( "badwordfile=s" => \$badwordfile,   # --badwordfile <filename>
              "outputdb=s"    => \$outputdbfile,  # --outputdb <filename>
              "keywordfile=s" => \$keywordfile,   # --keywordfile <filename>
              "help"          => \$help,          # --help
              "checkdates"    => \$checkdates,    # --checkdates
              "test"          => \$test)          # --test
      # GetOptions returns false (after warning on STDERR) when it meets a
      # switch it cannot parse.  Stop here instead of building a database
      # from a command line that was only partly understood.
      or die "Invalid command line; run with --help for usage.\n";
  134: 
  135: 
  ##
  ## Help! Help!
  ##
  ## Print the usage summary and exit when --help was given.
  ##
  ## The here-document is single-quoted on purpose: the text is printed
  ## verbatim.  In an interpolating here-document the backslash that ends the
  ## first line of the example command is treated as an escape for the
  ## newline and is dropped from the output.
  ##
  if ($help) {
      print <<'ENDHELP';
build_thesaurus_db.pl     Build a LON-CAPA thesaurus database.

Command line arguements
   --badwordfile <filename>     filename must contain a list of words not to
                                put in the thesaurus.  Each word must appear
                                on its own line and currently comments are not
                                supported.
   --checkdates                 Check the creation dates on the files involved
                                and only run if the outputdb file was created
                                prior to one of the badword or keyword files.
   --keywordfile <filename>     File containing the raw word data for the
                                thesaurus.  Each line must be comma seperated
                                list of related keywords.
   --outputdb <filename>        file to write the LON-CAPA thesaurus database
                                to.
   --help                       Display this help message and exit.
   --test                       Run a few test lookups after writing the 
                                database.
The following example shows the default values for each parameter

build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
     --outputdb ./thesaurus.db --keywordfile rawkey.txt

ENDHELP
      exit;
  }
  167: 
  ##
  ## Fall back to the default file name for every parameter that was not
  ## supplied (or was supplied empty), then insist that both input files
  ## are present before doing any work.
  ##
  $badwordfile  ||= "./un_keyword.tab";
  $outputdbfile ||= "./thesaurus.db";
  $keywordfile  ||= "./rawkey.txt";

  for my $file ($badwordfile,$keywordfile) {
      next if (-e $file);
      die "$file does not exist.";
  }
  178: 
  #
  # Check the dates on the input files to be sure we need to run.
  #
  # Only done when --checkdates was given and a non-empty output database
  # already exists.  The ctime (element 10 of the list returned by stat) of
  # every dependency is examined, and the newest one is compared with the
  # ctime of the output database.
  if ($checkdates && -s $outputdbfile) {
      my $highest_dependency_ctime = 0;
      foreach my $dependency ($badwordfile,$keywordfile) {
          # Each dependency must be stat'ed individually; reusing the result
          # of a single stat would compare one file against itself.
          my @Results = stat($dependency);
          die "Unable to stat ".$dependency.": $!\n" if (! @Results);
          if ($Results[10] > $highest_dependency_ctime) {
              $highest_dependency_ctime = $Results[10];
          }
      }
      #
      # if the outputdbfile was made AFTER the last version of one of the
      # dependencies, exit quietly.  If the output file cannot be stat'ed we
      # cannot show that it is current, so fall through and rebuild.
      my @Results = stat($outputdbfile);
      if (@Results && ($highest_dependency_ctime < $Results[10])) {
          exit;
      }
  }
  197: 
  ##
  ## Global hashes.
  ##

  # Number of times each word appears in the input file.
  my %wordcount;

  # Words related to a word.  The keys of this hash are words, and the
  # values are references to hashes which hold the related words and
  # their frequencies.
  my %related_words;

  # Holds an entry for each keyword that is 'bad'.
  my %isbad;
  207: 
  ##
  ## Initialize hash of bad words.  'bad' meaning their appearance in a keyword
  ## list does not add information.  Not 'bad' meaning profane.
  ##
  ## Every line of the bad word file is taken verbatim (minus the line ending)
  ## and stored lower-cased as a key of %isbad.
  ##
  # Three-argument open with the low-precedence 'or': with '||' the die would
  # bind to the file name instead of to open(), and a failed open would go
  # unnoticed.
  open(my $bad_fh,'<',$badwordfile)
      or die "Unable to open ".$badwordfile.": $!\n";
  while (my $line = <$bad_fh>) {
      chomp($line);
      $isbad{lc($line)}++;
  }
  close($bad_fh);
  218: 
  ##
  ## Read in the data file and construct the related words hash.  Skip bad
  ## words.
  ##
  ## Every line is lower-cased and split on runs of non-word characters.  For
  ## each word that is not 'bad', %wordcount is incremented and every other
  ## non-bad word on the same line is counted in %related_words.
  ##
  open(my $keyword_fh,'<',$keywordfile)
      or die "Unable to open ".$keywordfile.": $!\n";
  while (my $line = <$keyword_fh>) {
      chomp($line);
      # split returns a leading empty field when the line starts with a
      # non-word character; drop it so the empty string is never counted as
      # a keyword or listed as a related word.
      my @Words = grep { length($_) } split(/\W+/,lc($line));
      foreach my $keyword (@Words) {
          next if ($isbad{$keyword});
          $wordcount{$keyword}++;
          foreach my $otherword (@Words) {
              next if (($otherword eq $keyword) || ($isbad{$otherword}));
              $related_words{$keyword}->{$otherword}++;
          }
      }
  }
  close($keyword_fh);
  236: 
  ##
  ## Determine average number of entries
  ##
  ## $totalcount is the sum of all word counts; $avecount is that sum divided
  ## by the number of distinct words.
  ##
  my $totalcount = 0;
  foreach my $counted_word (keys(%wordcount)) {
      $totalcount += $wordcount{$counted_word};
  }
  my $distinct_words = scalar(keys(%wordcount));
  # Without any usable keyword the average is undefined.  Stop with a clear
  # message instead of an "Illegal division by zero" error.
  die "No usable keywords found in ".$keywordfile."\n" if (! $distinct_words);
  my $avecount = $totalcount / $distinct_words;
  245: 
  ##
  ## Make sure we can write the database: remove any existing output file,
  ## then tie %thesaurus_db to a freshly created GDBM file.
  ##
  if (-e $outputdbfile) {
      unlink($outputdbfile) or die "Cannot remove ".$outputdbfile;
  }
  my %thesaurus_db;
  tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)
      or die "Error opening DB file.\n";
  256: 
  ##
  ## Write the database file: one entry per word that has related words,
  ## holding the word's own count, a colon, and its related words string.
  ##
  for my $word (keys(%related_words)) {
      my $result = get_related($word);
      next if (! $result);
      $thesaurus_db{$word} = join(":",$wordcount{$word},$result);
  }

  ##
  ## Store away special values (must contain characters not matched by \w)
  ##
  @thesaurus_db{'average.count','total.count'} = ($avecount,$totalcount);
  untie %thesaurus_db;
  272: 
  ##
  ## Perform test lookups: with --test, reopen the database read-only and
  ## print the stored entry for a few sample words that have one.
  ##
  if ($test) {
      tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)
          || die "Error opening DB file.\n";
      for my $word (qw(torque rna polymerase)) {
          my $result = $thesaurus_db{$word};
          next if (! $result);
          print "Results for $word = $result\n";
      }
      untie %thesaurus_db;
  }
  286: 
  287: 
  ################################################################
  ################################################################
  #
  # get_related($keyword) is a utility function which will return a string
  #     of the format:
  #        keyword1,frequency1:keyword2,frequency2:.....
  #
  #     'frequency1' is the number of times the keyword1 appears in a keywords
  #     list with $keyword.
  #
  #     The related words are listed in descending order of frequency; words
  #     with equal frequency are listed alphabetically so that the output
  #     does not depend on hash ordering.
  #
  #     An optional second argument may supply the table to look in, as a
  #     reference to a hash of the same shape as %related_words
  #     (word => { related word => frequency }).  It defaults to
  #     %related_words.
  #
  #     Returns undef if $keyword is undefined or empty, or if the table
  #     holds no related words for it.
  #
  sub get_related {
      my ($keyword,$related_for) = @_;
      $related_for = \%related_words if (! defined($related_for));
      # Test for "no word" explicitly: a plain truth test would also reject
      # the legitimate word "0".
      return undef if ((! defined($keyword)) || ($keyword eq '') ||
                       (! exists($related_for->{$keyword})));
      my $frequency_of = $related_for->{$keyword};
      my @Related_words = sort { $frequency_of->{$b} <=> $frequency_of->{$a}
                                 || $a cmp $b } keys(%{$frequency_of});
      return undef if (! @Related_words);
      return join(':', map { $_.','.$frequency_of->{$_} } @Related_words);
  }
  312: 
  313: 
  314: 
  315: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>