Annotation of loncom/thesaurus/build_thesaurus_db.pl, revision 1.1

1.1     ! matthew     1: #!/usr/bin/perl -w
        !             2: #
        !             3: # $Id$
        !             4: #
        !             5: #
        !             6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
        !             7: #
        !             8: # Copyright Michigan State University Board of Trustees
        !             9: #
        !            10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
        !            11: #
        !            12: # LON-CAPA is free software; you can redistribute it and/or modify
        !            13: # it under the terms of the GNU General Public License as published by
        !            14: # the Free Software Foundation; either version 2 of the License, or
        !            15: # (at your option) any later version.
        !            16: #
        !            17: # LON-CAPA is distributed in the hope that it will be useful,
        !            18: # but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
        !            20: # GNU General Public License for more details.
        !            21: #
        !            22: # You should have received a copy of the GNU General Public License
        !            23: # along with LON-CAPA; if not, write to the Free Software
        !            24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
        !            25: #
        !            26: # /home/httpd/html/adm/gpl.txt
        !            27: #
        !            28: # http://www.lon-capa.org/
        !            29: #
        !            30: use strict;
        !            31: use Getopt::Long;
        !            32: use GDBM_File;
        !            33: # POD required stuff:
        !            34: 
        !            35: =pod
        !            36: 
        !            37: =head1 NAME
        !            38: 
        !            39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
        !            40: 
        !            41: =head1 SYNOPSIS
        !            42: 
        !            43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
        !            44: 
        !            45: =head1 DESCRIPTION
        !            46: 
        !            47: build_thesaurus_db.pl reads two input files.  The first is a list of words to
        !            48: omit from the thesaurus.  The second is the raw keyword data for the thesaurus.
        !            49: From this file a database is built.
        !            50: 
        !            51: =head1 DATABASE FORMAT DESCRIPTION
        !            52: 
        !            53: The structure of the database entries is described below.  
        !            54: 
        !            55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
        !            56: 
        !            57: Allow me to repeat myself:
        !            58: 
        !            59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
        !            60: 
        !            61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
        !            62: 
        !            63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
        !            64: 
        !            65: Got it?  While you are reading this, let me encourage you to document
        !            66: any changes to the structure of the database.  It is not that hard and
        !            67: you will save much time if you do.  
        !            68: 
        !            69: That said, you should make sure the description below actually matches
        !            70: the code, just to be safe.
        !            71: 
        !            72: This concludes the lecture portion of the comments.
        !            73: 
        !            74: =head1 DATABASE FORMAT DESCRIPTION
        !            75: 
        !            76: An entry in the database for a given word is shown below:
        !            77: 
        !            78:  polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
        !            79:               |   |  |
        !            80:               |   |  The number of times dna appeared in a keywords list
        !            81:               |   |  with the word polymerase. 
        !            82:               |   The related keyword
        !            83:               The number of times polymerase appeared in a keywords list.
        !            84: 
        !            85: Note: the related words list will be in descending order of occurrence with
        !            86: the keyword.
        !            87: 
        !            88: =head1 COMMAND LINE OPTIONS
        !            89: 
        !            90: =over 4
        !            91: 
        !            92: 
        !            93: =item --badwordfile <filename>
        !            94: 
        !            95: filename must contain a list of words not to put in the thesaurus.  
        !            96: Each word must appear on its own line.
        !            97: Currently comments are not supported.
        !            98: 
        !            99: =item --keywordfile <filename>
        !           100: 
        !           101: File containing the raw word data for the thesaurus.  Each line must be a
        !           102: comma-separated list of related keywords.
        !           103: 
        !           104: =item --outputdb <filename>
        !           105: 
        !           106: file to write the LON-CAPA thesaurus database to.
        !           107: 
        !           108: =item --help
        !           109: 
        !           110: Display this help message and exit.
        !           111: 
        !           112: =item --test
        !           113: 
        !           114: Run a few test lookups after writing the database.
        !           115: 
        !           116: =back
        !           117: 
        !           118: The following example shows the default values for each parameter
        !           119: 
        !           120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
        !           121: 
        !           122: =cut
        !           123: 
        !           124: ##
        !           125: ## Get command line parameters
        !           126: ##
        !           127: my ($badwordfile,$outputdbfile,$keywordfile,$help,$test);
        !           128: GetOptions( "badwordfile=s" => \$badwordfile,   # --badwordfile
        !           129:             "outputdb=s"    => \$outputdbfile,  # --outputdb
        !           130:             "keywordfile=s" => \$keywordfile,   # --keywordfile
        !           131:             "help"          => \$help,          # --help
        !           132:             "test"          => \$test);         # --test
        !           133: 
        !           134: ##
        !           135: ## Help! Help!
        !           136: ##
        !           137: if ($help) {
        !           138:     print <<ENDHELP;
        !           139: build_thesaurus_db.pl     Build a LON-CAPA thesaurus database.
        !           140: 
        !           141: Command line arguements
        !           142:    --badwordfile <filename>     filename must contain a list of words not to
        !           143:                                 put in the thesaurus.  Each word must appear
        !           144:                                 on its own line and currently comments are not
        !           145:                                 supported.
        !           146:    --keywordfile <filename>     File containing the raw word data for the
        !           147:                                 thesaurus.  Each line must be comma seperated
        !           148:                                 list of related keywords.
        !           149:    --outputdb <filename>        file to write the LON-CAPA thesaurus database
        !           150:                                 to.
        !           151:    --help                       Display this help message and exit.
        !           152:    --test                       Run a few test lookups after writing the 
        !           153:                                 database.
        !           154: The following example shows the default values for each parameter
        !           155: 
        !           156: build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
        !           157:      --outputdb ./thesaurus.db --keywordfile rawkey.txt
        !           158: 
        !           159: ENDHELP
        !           160:     exit;
        !           161: }
        !           162: 
        !           163: ##
        !           164: ## Set up defaults for parameters and check validity
        !           165: ##
        !           166: $badwordfile  = $badwordfile  || "./un_keyword.tab";
        !           167: $outputdbfile = $outputdbfile || "./thesaurus.db";
        !           168: $keywordfile  = $keywordfile  || "./rawkey.txt";
        !           169: 
        !           170: foreach my $file ($badwordfile,$keywordfile) {
        !           171:     die "$file does not exist." if (! -e $file);
        !           172: }
        !           173: 
        !           174: ##
        !           175: ## Global hashes.
        !           176: ##
        !           177: my %wordcount = ();    # Holds the number of times each word appears in the
        !           178:                        # input file.
        !           179: my %related_words=();  # Holds the words related to a word.  The keys of this
        !           180:                        # has are words, and the values are pointers to hashes
        !           181:                        # which hold the words and their frequencies.
        !           182: my %isbad;             # Holds an entry for each keyword that is 'bad'
        !           183: 
        !           184: ##
        !           185: ## Initialize hash of bad words.  'bad' meaning their appearance in a keyword
        !           186: ## list does not add information.  Not 'bad' meaning profane.  
        !           187: ##
        !           188: open BAD,$badwordfile || die "Unable to open ".$badwordfile;
        !           189: while (<BAD>) {
        !           190:     chomp;
        !           191:     $isbad{lc($_)}++;
        !           192: }
        !           193: close BAD;
        !           194: 
        !           195: ##
        !           196: ## Read in the data file and construction related words hash.  Skip bad words.
        !           197: ##
        !           198: open(IN,$keywordfile) || die "Unable to open ".$keywordfile;
        !           199: while (<IN>) {
        !           200:     chomp;
        !           201:     my @Words = split(/\W+/,lc($_));
        !           202:     foreach my $keyword (@Words) {
        !           203:         next if ($isbad{$keyword});
        !           204:         $wordcount{$keyword}++;
        !           205:         foreach my $otherword (@Words) {
        !           206:             next if (($otherword eq $keyword) || ($isbad{$otherword}));
        !           207:             $related_words{$keyword}->{$otherword}++;
        !           208:         }
        !           209:     }
        !           210: }
        !           211: close(IN);
        !           212: 
        !           213: ##
        !           214: ## Determine average number of entries
        !           215: ##
        !           216: my $totalcount;
        !           217: foreach (keys(%wordcount)) {
        !           218:     $totalcount+=$wordcount{$_};
        !           219: }
        !           220: my $avecount = $totalcount /(scalar keys(%wordcount));
        !           221: 
        !           222: ##
        !           223: ## Make sure we can write the database.
        !           224: ##
        !           225: if (-e $outputdbfile) {
        !           226:     die "Cannot remove ".$outputdbfile if (!unlink $outputdbfile);
        !           227: }
        !           228: my %thesaurus_db;
        !           229: if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)) {
        !           230:     die "Error opening DB file.\n";
        !           231: }
        !           232: 
        !           233: ##
        !           234: ## Write the database file
        !           235: ##
        !           236: foreach my $word (keys(%related_words)) {
        !           237:     next if (! defined($word));
        !           238:     my $result = &get_related($word);
        !           239:     $thesaurus_db{$word}=$wordcount{$word}.":".$result if ($result);
        !           240: }
        !           241: 
        !           242: ##
        !           243: ## Store away special values (must contain characters not matched by \w)
        !           244: ##
        !           245: $thesaurus_db{'average.count'}=$avecount;
        !           246: $thesaurus_db{'total.count'}=$totalcount;
        !           247: untie %thesaurus_db;
        !           248: 
        !           249: ##
        !           250: ## Perform test lookups
        !           251: ##
        !           252: if ($test) {
        !           253:     if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)) {
        !           254:         die "Error opening DB file.\n";
        !           255:     }
        !           256:     foreach my $word ('torque','rna','polymerase') {
        !           257:         my $result = $thesaurus_db{$word};
        !           258:         print "Results for $word = $result\n" if ($result);
        !           259:     }
        !           260:     untie %thesaurus_db;
        !           261: }
        !           262: 
        !           263: 
        !           264: ################################################################
        !           265: ################################################################
        !           266: #
        !           267: # get_related($keyword) is a utility function which will return a string
        !           268: #     of the format: 
        !           269: #        keyword1,frequency1:keyword2,frequency2:.....
        !           270: #
        !           271: #     'frequency1' is the number of times the keyword1 appears in a keywords
        !           272: #     list with $keyword.
        !           273: #
        !           274: sub get_related {
        !           275:     my $keyword = shift;
        !           276:     return undef if ((! $keyword) ||(! exists($related_words{$keyword})));
        !           277:     my %related_hash = %{$related_words{$keyword}};
        !           278:     my @Related_words = keys(%{$related_words{$keyword}});
        !           279:     @Related_words = sort {$related_hash{$b} <=> $related_hash{$a} } 
        !           280:                           @Related_words;
        !           281:     my $result;
        !           282:     foreach (@Related_words) {
        !           283:         $result .= "$_,$related_hash{$_}:";
        !           284:     }
        !           285:     chop $result;
        !           286:     return $result;
        !           287: }
        !           288: 
        !           289: 
        !           290: 
        !           291: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>