loncom/thesaurus/build_thesaurus_db.pl - view

File: [LON-CAPA] / loncom / thesaurus / build_thesaurus_db.pl
Revision 1.2: download - view: text, annotated - select for diffs
Tue Aug 5 15:51:37 2003 UTC (20 years, 8 months ago) by matthew
Branches: MAIN
CVS tags: version_2_9_X, version_2_9_99_0, version_2_9_1, version_2_9_0, version_2_8_X, version_2_8_99_1, version_2_8_99_0, version_2_8_2, version_2_8_1, version_2_8_0, version_2_7_X, version_2_7_99_1, version_2_7_99_0, version_2_7_1, version_2_7_0, version_2_6_X, version_2_6_99_1, version_2_6_99_0, version_2_6_3, version_2_6_2, version_2_6_1, version_2_6_0, version_2_5_X, version_2_5_99_1, version_2_5_99_0, version_2_5_2, version_2_5_1, version_2_5_0, version_2_4_X, version_2_4_99_0, version_2_4_2, version_2_4_1, version_2_4_0, version_2_3_X, version_2_3_99_0, version_2_3_2, version_2_3_1, version_2_3_0, version_2_2_X, version_2_2_99_1, version_2_2_99_0, version_2_2_2, version_2_2_1, version_2_2_0, version_2_1_X, version_2_1_99_3, version_2_1_99_2, version_2_1_99_1, version_2_1_99_0, version_2_1_3, version_2_1_2, version_2_1_1, version_2_1_0, version_2_12_X, version_2_11_X, version_2_11_4_uiuc, version_2_11_4_msu, version_2_11_4, version_2_11_3_uiuc, version_2_11_3_msu, version_2_11_3, version_2_11_2_uiuc, version_2_11_2_msu, version_2_11_2_educog, version_2_11_2, version_2_11_1, version_2_11_0_RC3, version_2_11_0_RC2, version_2_11_0_RC1, version_2_11_0, version_2_10_X, version_2_10_1, version_2_10_0_RC2, version_2_10_0_RC1, version_2_10_0, version_2_0_X, version_2_0_99_1, version_2_0_2, version_2_0_1, version_2_0_0, version_1_99_3, version_1_99_2, version_1_99_1_tmcc, version_1_99_1, version_1_99_0_tmcc, version_1_99_0, version_1_3_X, version_1_3_3, version_1_3_2, version_1_3_1, version_1_3_0, version_1_2_X, version_1_2_99_1, version_1_2_99_0, version_1_2_1, version_1_2_0, version_1_1_X, version_1_1_99_5, version_1_1_99_4, version_1_1_99_3, version_1_1_99_2, version_1_1_99_1, version_1_1_99_0, version_1_1_3, version_1_1_2, version_1_1_1, version_1_1_0, version_1_0_99_3, version_1_0_99_2, version_1_0_99_1, version_1_0_99, loncapaMITrelate_1, language_hyphenation_merge, language_hyphenation, bz6209-base, bz6209, bz5969, bz2851, PRINT_INCOMPLETE_base, 
PRINT_INCOMPLETE, HEAD, GCI_3, GCI_2, GCI_1, BZ5971-printing-apage, BZ5434-fox, BZ4492-merge, BZ4492-feature_horizontal_radioresponse

Bug 1492: build_thesaurus_db.pl has new command line switch --checkdates
to only build the thesaurus if the dependencies are newer than the target.

#!/usr/bin/perl -w
#
# $Id: build_thesaurus_db.pl,v 1.2 2003/08/05 15:51:37 matthew Exp $
#
#
# build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
use strict;
use warnings;
use Getopt::Long;
use GDBM_File;
# POD required stuff:

=pod

=head1 NAME

build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.

=head1 SYNOPSIS

build_thesaurus_db.pl creates the LON-CAPA thesaurus database.

=head1 DESCRIPTION

build_thesaurus_db.pl reads two input files.  The first is a list of words to
omit from the thesaurus.  The second is the raw keyword data for the thesaurus.
From this file a database is built.

=head1 DATABASE FORMAT DESCRIPTION

The structure of the database entries is described below.

=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!

Allow me to repeat myself:

=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!

=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!

=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!

Got it?  While you are reading this, let me encourage you to document
any changes to the structure of the database.  It is not that hard and
you will save much time if you do.

That said, you should make sure the description below actually matches
the code, just to be safe.

This concludes the lecture portion of the comments.

=head1 DATABASE FORMAT DESCRIPTION

An entry in the database for a given word is shown below:

 polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
              |  |   |
              |  |   The number of times dna appeared in a keywords list
              |  |   with the word polymerase.
              |  The related keyword
              The number of times polymerase appeared in a keywords list.

Note: the related words list will be in descending order of occurrence with
the keyword.  Related words with the same count are listed alphabetically.

In addition to the word entries, two special keys are stored:
'average.count' and 'total.count'.  They contain a '.', which can never
appear in a keyword, so they cannot collide with a real entry.

=head1 COMMAND LINE OPTIONS

=over 4


=item --badwordfile <filename>

filename must contain a list of words not to put in the thesaurus.
Each word must appear on its own line.
Currently comments are not supported.

=item --checkdates

Compare the inode change times (ctime) of the files involved and only
rebuild the database if the outputdb file is not newer than both the
badword file and the keyword file.  If the outputdb file does not exist or
is empty, the database is always built.

=item --keywordfile <filename>

File containing the raw word data for the thesaurus.  Each line must be
comma separated list of related keywords.

=item --outputdb <filename>

file to write the LON-CAPA thesaurus database to.

=item --help

Display this help message and exit.

=item --test

Run a few test lookups after writing the database.

=back

The following example shows the default values for each parameter

 build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile ./rawkey.txt

=cut

##
## Get command line parameters
##
my ($badwordfile,$outputdbfile,$keywordfile,$help,$checkdates,$test);
GetOptions( "badwordfile=s" => \$badwordfile,    # --badwordfile
            "outputdb=s"    => \$outputdbfile,   # --outputdb
            "keywordfile=s" => \$keywordfile,    # --keywordfile
            "help"          => \$help,           # --help
            "checkdates"    => \$checkdates,     # --checkdates
            "test"          => \$test);          # --test


##
## Help! Help!
##
if ($help) {
    # Single-quoted heredoc: nothing below is meant to be interpolated, and
    # the trailing backslash in the example command must be printed as-is.
    print <<'ENDHELP';
build_thesaurus_db.pl     Build a LON-CAPA thesaurus database.

Command line arguements
   --badwordfile <filename>     filename must contain a list of words not to
                                put in the thesaurus.  Each word must appear
                                on its own line and currently comments are not
                                supported.
   --checkdates                 Check the creation dates on the files involved
                                and only run if the outputdb file was created
                                prior to one of the badword or keyword files.
   --keywordfile <filename>     File containing the raw word data for the
                                thesaurus.  Each line must be comma seperated
                                list of related keywords.
   --outputdb <filename>        file to write the LON-CAPA thesaurus database
                                to.
   --help                       Display this help message and exit.
   --test                       Run a few test lookups after writing the
                                database.
The following example shows the default values for each parameter

build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
     --outputdb ./thesaurus.db --keywordfile rawkey.txt

ENDHELP
    exit;
}

##
## Set up defaults for parameters and check validity
##
$badwordfile  ||= "./un_keyword.tab";
$outputdbfile ||= "./thesaurus.db";
$keywordfile  ||= "./rawkey.txt";

foreach my $file ($badwordfile,$keywordfile) {
    die "$file does not exist." if (! -e $file);
}

#
# Check the dates on the input files to be sure we need to run.
# Index 10 of stat() is the ctime (inode change time) of the file.
if ($checkdates && -s $outputdbfile) {
    my $highest_dependency_ctime = 0;
    # Every dependency must be stat()ed individually; the newest one decides.
    foreach my $dependency ($badwordfile,$keywordfile) {
        my $ctime = (stat($dependency))[10];
        die "Unable to stat $dependency: $!" if (! defined($ctime));
        if ($ctime > $highest_dependency_ctime) {
            $highest_dependency_ctime = $ctime;
        }
    }
    #
    # if the outputdbfile was made AFTER the last version of one of the
    # dependencies, exit quietly.  If the output file cannot be stat()ed
    # we fall through and rebuild it.
    my $output_ctime = (stat($outputdbfile))[10];
    if (defined($output_ctime) &&
        $highest_dependency_ctime < $output_ctime) {
        exit;
    }
}

##
## Global hashes.
##
my %wordcount = ();   # Holds the number of times each word appears in the
                      # input file.
my %related_words=(); # Holds the words related to a word.  The keys of this
                      # hash are words, and the values are pointers to hashes
                      # which hold the words and their frequencies.
my %isbad;            # Holds an entry for each keyword that is 'bad'

##
## Initialize hash of bad words.  'bad' meaning their appearance in a keyword
## list does not add information.  Not 'bad' meaning profane.
##
open(my $bad_fh,'<',$badwordfile)
    or die "Unable to open ".$badwordfile.": $!";
while (my $line = <$bad_fh>) {
    chomp($line);
    # Keywords never contain whitespace (they are split on \W+ below), so
    # strip stray blanks and carriage returns to make sure the entry matches.
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    next if ($line eq '');
    $isbad{lc($line)}++;
}
close($bad_fh);

##
## Read in the data file and construct the related words hash.
## Skip bad words.
##
open(my $in_fh,'<',$keywordfile)
    or die "Unable to open ".$keywordfile.": $!";
while (my $line = <$in_fh>) {
    chomp($line);
    # split() yields a leading empty field when the line starts with a
    # non-word character; the empty string is not a keyword, so drop it.
    my @Words = grep { length($_) } split(/\W+/,lc($line));
    foreach my $keyword (@Words) {
        next if ($isbad{$keyword});
        $wordcount{$keyword}++;
        foreach my $otherword (@Words) {
            next if (($otherword eq $keyword) || ($isbad{$otherword}));
            $related_words{$keyword}->{$otherword}++;
        }
    }
}
close($in_fh);

##
## Determine average number of entries
##
my $totalcount = 0;
foreach my $count (values(%wordcount)) {
    $totalcount += $count;
}
my $distinct_words = scalar(keys(%wordcount));
# Bail out before the existing database is removed (and before dividing by
# zero) when the keyword file yielded nothing usable.
if (! $distinct_words) {
    die "No usable keywords found in ".$keywordfile.
        "; not building ".$outputdbfile.".\n";
}
my $avecount = $totalcount / $distinct_words;

##
## Make sure we can write the database.
##
if (-e $outputdbfile) {
    die "Cannot remove ".$outputdbfile if (!unlink $outputdbfile);
}
my %thesaurus_db;
if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,GDBM_WRCREAT(),0640)) {
    die "Error opening DB file.\n";
}

##
## Write the database file
##
foreach my $word (keys(%related_words)) {
    my $result = get_related($word);
    next if (! defined($result) || $result eq '');
    $thesaurus_db{$word}=$wordcount{$word}.":".$result;
}

##
## Store away special values (must contain characters not matched by \w)
##
$thesaurus_db{'average.count'}=$avecount;
$thesaurus_db{'total.count'}=$totalcount;
untie %thesaurus_db;

##
## Perform test lookups
##
if ($test) {
    if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,GDBM_READER(),0640)) {
        die "Error opening DB file.\n";
    }
    foreach my $word ('torque','rna','polymerase') {
        my $result = $thesaurus_db{$word};
        print "Results for $word = $result\n" if ($result);
    }
    untie %thesaurus_db;
}


################################################################
################################################################
#
# get_related($keyword) is a utility function which will return a string
# of the format:
#        keyword1,frequency1:keyword2,frequency2:.....
#
# 'frequency1' is the number of times the keyword1 appears in a keywords
# list with $keyword.
#
# The entries are sorted by descending frequency; entries with the same
# frequency are sorted alphabetically so that the output is reproducible.
#
# Input:   $keyword - the word to look up in %related_words.
# Returns: the string described above, or undef (empty list in list
#          context) if $keyword is undefined, empty, or has no related words.
#
sub get_related {
    my $keyword = shift;
    # Test for defined/non-empty rather than truth so that the keyword "0"
    # is not rejected.
    return if (! defined($keyword) || $keyword eq '');
    return if (! exists($related_words{$keyword}));
    my $frequency_of = $related_words{$keyword};
    my @Related_words = sort {
        $frequency_of->{$b} <=> $frequency_of->{$a} || $a cmp $b
    } keys(%{$frequency_of});
    return if (! @Related_words);
    return join(':', map { $_.','.$frequency_of->{$_} } @Related_words);
}