loncom/thesaurus/build_thesaurus_db.pl - view

File: [LON-CAPA] / loncom / thesaurus / build_thesaurus_db.pl
Revision 1.1: download - view: text, annotated - select for diffs
Thu Jul 11 20:48:31 2002 UTC (21 years, 10 months ago) by matthew
Branches: MAIN
CVS tags: version_1_0_3, version_1_0_2, version_1_0_1, version_1_0_0, version_0_99_5, version_0_99_4, version_0_99_3, version_0_99_2, version_0_99_1, version_0_99_0, version_0_6_2, version_0_6, version_0_5_1, version_0_5, conference_2003, HEAD

Script to build LON-CAPA thesaurus database.

1: #!/usr/bin/perl -w 2: # 3: # $Id: build_thesaurus_db.pl,v 1.1 2002/07/11 20:48:31 matthew Exp $ 4: # 5: # 6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database. 7: # 8: # Copyright Michigan State University Board of Trustees 9: # 10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA). 11: # 12: # LON-CAPA is free software; you can redistribute it and/or modify 13: # it under the terms of the GNU General Public License as published by 14: # the Free Software Foundation; either version 2 of the License, or 15: # (at your option) any later version. 16: # 17: # LON-CAPA is distributed in the hope that it will be useful, 18: # but WITHOUT ANY WARRANTY; without even the implied warranty of 19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20: # GNU General Public License for more details. 21: # 22: # You should have received a copy of the GNU General Public License 23: # along with LON-CAPA; if not, write to the Free Software 24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25: # 26: # /home/httpd/html/adm/gpl.txt 27: # 28: # http://www.lon-capa.org/ 29: # 30: use strict; 31: use Getopt::Long; 32: use GDBM_File; 33: # POD required stuff: 34: 35: =pod 36: 37: =head1 NAME 38: 39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database. 40: 41: =head1 SYNOPSIS 42: 43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database. 44: 45: =head1 DESCRIPTION 46: 47: build_thesaurus_db.pl reads two input files. The first is a list of words to 48: omit from the thesaurus. The second is the raw keyword data for the thesaurus. 49: From this file a database is built. 50: 51: =head1 DATABASE FORMAT DESCRIPTION 52: 53: The structure of the database entries is described below. 54: 55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm! 56: 57: Allow me to repeat myself: 58: 59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm! 60: 61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm! 62: 63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm! 64: 65: Got it? While you are reading this, let me encourage you to document 66: any changes to the structure of the database. It is not that hard and 67: you will save much time if you do. 68: 69: That said, you should make sure the description below actually matches 70: the code, just to be safe. 71: 72: This concludes the lecture portion of the comments. 73: 74: =head1 DATABASE FORMAT DESCRIPTION 75: 76: An entry in the database for a given word is shown below: 77: 78: polymerase = 42:dna,32:rna,30:transcription,19:protein,16:... 79: | | | 80: | | The number of times dna appeared in a keywords list 81: | | with the word polymerase. 82: | The related keyword 83: The number of times polymerase appeared in a keywords list. 84: 85: Note: the related words list will be in descending order of occurance with 86: the keyword. 87: 88: =head1 COMMAND LINE OPTIONS 89: 90: =over 4 91: 92: 93: =item --badwordfile <filename> 94: 95: filename must contain a list of words not to put in the thesaurus. 96: Each word must appear on its own line. 97: Currently comments are not supported. 98: 99: =item --keywordfile <filename> 100: 101: File containing the raw word data for the thesaurus. Each line must be 102: comma seperated list of related keywords. 103: 104: =item --outputdb <filename> 105: 106: file to write the LON-CAPA thesaurus database to. 107: 108: =item --help 109: 110: Display this help message and exit. 111: 112: =item --test 113: 114: Run a few test lookups after writing the database. 115: 116: =back 117: 118: The following example shows the default values for each parameter 119: 120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt 121: 122: =cut 123: 124: ## 125: ## Get command line parameters 126: ## 127: my ($badwordfile,$outputdbfile,$keywordfile,$help,$test); 128: GetOptions( "badwordfile=s" => \$badwordfile, # --badwordfile 129: "outputdb=s" => \$outputdbfile, # --outputdb 130: "keywordfile=s" => \$keywordfile, # --keywordfile 131: "help" => \$help, # --help 132: "test" => \$test); # --test 133: 134: ## 135: ## Help! Help! 136: ## 137: if ($help) { 138: print <<ENDHELP; 139: build_thesaurus_db.pl Build a LON-CAPA thesaurus database. 140: 141: Command line arguements 142: --badwordfile <filename> filename must contain a list of words not to 143: put in the thesaurus. Each word must appear 144: on its own line and currently comments are not 145: supported. 146: --keywordfile <filename> File containing the raw word data for the 147: thesaurus. Each line must be comma seperated 148: list of related keywords. 149: --outputdb <filename> file to write the LON-CAPA thesaurus database 150: to. 151: --help Display this help message and exit. 152: --test Run a few test lookups after writing the 153: database. 154: The following example shows the default values for each parameter 155: 156: build_thesaurus_db.pl --badwordfile ./un_keyword.tab \ 157: --outputdb ./thesaurus.db --keywordfile rawkey.txt 158: 159: ENDHELP 160: exit; 161: } 162: 163: ## 164: ## Set up defaults for parameters and check validity 165: ## 166: $badwordfile = $badwordfile || "./un_keyword.tab"; 167: $outputdbfile = $outputdbfile || "./thesaurus.db"; 168: $keywordfile = $keywordfile || "./rawkey.txt"; 169: 170: foreach my $file ($badwordfile,$keywordfile) { 171: die "$file does not exist." if (! -e $file); 172: } 173: 174: ## 175: ## Global hashes. 176: ## 177: my %wordcount = (); # Holds the number of times each word appears in the 178: # input file. 179: my %related_words=(); # Holds the words related to a word. The keys of this 180: # has are words, and the values are pointers to hashes 181: # which hold the words and their frequencies. 182: my %isbad; # Holds an entry for each keyword that is 'bad' 183: 184: ## 185: ## Initialize hash of bad words. 'bad' meaning their appearance in a keyword 186: ## list does not add information. Not 'bad' meaning profane. 187: ## 188: open BAD,$badwordfile || die "Unable to open ".$badwordfile; 189: while (<BAD>) { 190: chomp; 191: $isbad{lc($_)}++; 192: } 193: close BAD; 194: 195: ## 196: ## Read in the data file and construction related words hash. Skip bad words. 197: ## 198: open(IN,$keywordfile) || die "Unable to open ".$keywordfile; 199: while (<IN>) { 200: chomp; 201: my @Words = split(/\W+/,lc($_)); 202: foreach my $keyword (@Words) { 203: next if ($isbad{$keyword}); 204: $wordcount{$keyword}++; 205: foreach my $otherword (@Words) { 206: next if (($otherword eq $keyword) || ($isbad{$otherword})); 207: $related_words{$keyword}->{$otherword}++; 208: } 209: } 210: } 211: close(IN); 212: 213: ## 214: ## Determine average number of entries 215: ## 216: my $totalcount; 217: foreach (keys(%wordcount)) { 218: $totalcount+=$wordcount{$_}; 219: } 220: my $avecount = $totalcount /(scalar keys(%wordcount)); 221: 222: ## 223: ## Make sure we can write the database. 224: ## 225: if (-e $outputdbfile) { 226: die "Cannot remove ".$outputdbfile if (!unlink $outputdbfile); 227: } 228: my %thesaurus_db; 229: if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)) { 230: die "Error opening DB file.\n"; 231: } 232: 233: ## 234: ## Write the database file 235: ## 236: foreach my $word (keys(%related_words)) { 237: next if (! defined($word)); 238: my $result = &get_related($word); 239: $thesaurus_db{$word}=$wordcount{$word}.":".$result if ($result); 240: } 241: 242: ## 243: ## Store away special values (must contain characters not matched by \w) 244: ## 245: $thesaurus_db{'average.count'}=$avecount; 246: $thesaurus_db{'total.count'}=$totalcount; 247: untie %thesaurus_db; 248: 249: ## 250: ## Perform test lookups 251: ## 252: if ($test) { 253: if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)) { 254: die "Error opening DB file.\n"; 255: } 256: foreach my $word ('torque','rna','polymerase') { 257: my $result = $thesaurus_db{$word}; 258: print "Results for $word = $result\n" if ($result); 259: } 260: untie %thesaurus_db; 261: } 262: 263: 264: ################################################################ 265: ################################################################ 266: # 267: # get_related($keyword) is a utility function which will return a string 268: # of the format: 269: # keyword1,frequency1:keyword2,frequency2:..... 270: # 271: # 'frequency1' is the number of times the keyword1 appears in a keywords 272: # list with $keyword. 273: # 274: sub get_related { 275: my $keyword = shift; 276: return undef if ((! $keyword) ||(! exists($related_words{$keyword}))); 277: my %related_hash = %{$related_words{$keyword}}; 278: my @Related_words = keys(%{$related_words{$keyword}}); 279: @Related_words = sort {$related_hash{$b} <=> $related_hash{$a} } 280: @Related_words; 281: my $result; 282: foreach (@Related_words) { 283: $result .= "$_,$related_hash{$_}:"; 284: } 285: chop $result; 286: return $result; 287: } 288: 289: 290: 291: