#!/usr/bin/perl -w # # $Id: build_thesaurus_db.pl,v 1.2 2003/08/05 15:51:37 matthew Exp $ # # # build_thesaurus_db.pl creates the LON-CAPA thesaurus database. # # Copyright Michigan State University Board of Trustees # # This file is part of the LearningOnline Network with CAPA (LON-CAPA). # # LON-CAPA is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # LON-CAPA is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with LON-CAPA; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # /home/httpd/html/adm/gpl.txt # # http://www.lon-capa.org/ # use strict; use Getopt::Long; use GDBM_File; # POD required stuff: =pod =head1 NAME build_thesaurus_db.pl - Build the LON-CAPA thesaurus database. =head1 SYNOPSIS build_thesaurus_db.pl creates the LON-CAPA thesaurus database. =head1 DESCRIPTION build_thesaurus_db.pl reads two input files. The first is a list of words to omit from the thesaurus. The second is the raw keyword data for the thesaurus. From this file a database is built. =head1 DATABASE FORMAT DESCRIPTION The structure of the database entries is described below. =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm! Allow me to repeat myself: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm! =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm! =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm! Got it? While you are reading this, let me encourage you to document any changes to the structure of the database. It is not that hard and you will save much time if you do. That said, you should make sure the description below actually matches the code, just to be safe. This concludes the lecture portion of the comments. =head1 DATABASE FORMAT DESCRIPTION An entry in the database for a given word is shown below: polymerase = 42:dna,32:rna,30:transcription,19:protein,16:... | | | | | The number of times dna appeared in a keywords list | | with the word polymerase. | The related keyword The number of times polymerase appeared in a keywords list. Note: the related words list will be in descending order of occurance with the keyword. =head1 COMMAND LINE OPTIONS =over 4 =item --badwordfile filename must contain a list of words not to put in the thesaurus. Each word must appear on its own line. Currently comments are not supported. =item --keywordfile File containing the raw word data for the thesaurus. Each line must be comma seperated list of related keywords. =item --outputdb file to write the LON-CAPA thesaurus database to. =item --help Display this help message and exit. =item --test Run a few test lookups after writing the database. =back The following example shows the default values for each parameter build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt =cut ## ## Get command line parameters ## my ($badwordfile,$outputdbfile,$keywordfile,$help,$checkdates,$test); GetOptions( "badwordfile=s" => \$badwordfile, # --badwordfile "outputdb=s" => \$outputdbfile, # --outputdb "keywordfile=s" => \$keywordfile, # --keywordfile "help" => \$help, # --help "checkdates" => \$checkdates, # --checkdates "test" => \$test); # --test ## ## Help! Help! ## if ($help) { print < filename must contain a list of words not to put in the thesaurus. Each word must appear on its own line and currently comments are not supported. --checkdates Check the creation dates on the files involved and only run if the outputdb file was created prior to one of the badword or keyword files. --keywordfile File containing the raw word data for the thesaurus. Each line must be comma seperated list of related keywords. --outputdb file to write the LON-CAPA thesaurus database to. --help Display this help message and exit. --test Run a few test lookups after writing the database. The following example shows the default values for each parameter build_thesaurus_db.pl --badwordfile ./un_keyword.tab \ --outputdb ./thesaurus.db --keywordfile rawkey.txt ENDHELP exit; } ## ## Set up defaults for parameters and check validity ## $badwordfile = $badwordfile || "./un_keyword.tab"; $outputdbfile = $outputdbfile || "./thesaurus.db"; $keywordfile = $keywordfile || "./rawkey.txt"; foreach my $file ($badwordfile,$keywordfile) { die "$file does not exist." if (! -e $file); } # # Check the dates on the input files to be sure we need to run if ($checkdates && -s $outputdbfile) { my @Results = stat($badwordfile); my $highest_dependency_ctime = $Results[10]; foreach ($keywordfile) { if ($Results[10] > $highest_dependency_ctime) { $highest_dependency_ctime = $Results[10]; } } # # if the outputdbfile was made AFTER the last version of one of the # dependencies, exit quietly. @Results = stat($outputdbfile); if ($highest_dependency_ctime < $Results[10]) { exit; } } ## ## Global hashes. ## my %wordcount = (); # Holds the number of times each word appears in the # input file. my %related_words=(); # Holds the words related to a word. The keys of this # has are words, and the values are pointers to hashes # which hold the words and their frequencies. my %isbad; # Holds an entry for each keyword that is 'bad' ## ## Initialize hash of bad words. 'bad' meaning their appearance in a keyword ## list does not add information. Not 'bad' meaning profane. ## open BAD,$badwordfile || die "Unable to open ".$badwordfile; while () { chomp; $isbad{lc($_)}++; } close BAD; ## ## Read in the data file and construction related words hash. Skip bad words. ## open(IN,$keywordfile) || die "Unable to open ".$keywordfile; while () { chomp; my @Words = split(/\W+/,lc($_)); foreach my $keyword (@Words) { next if ($isbad{$keyword}); $wordcount{$keyword}++; foreach my $otherword (@Words) { next if (($otherword eq $keyword) || ($isbad{$otherword})); $related_words{$keyword}->{$otherword}++; } } } close(IN); ## ## Determine average number of entries ## my $totalcount; foreach (keys(%wordcount)) { $totalcount+=$wordcount{$_}; } my $avecount = $totalcount /(scalar keys(%wordcount)); ## ## Make sure we can write the database. ## if (-e $outputdbfile) { die "Cannot remove ".$outputdbfile if (!unlink $outputdbfile); } my %thesaurus_db; if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)) { die "Error opening DB file.\n"; } ## ## Write the database file ## foreach my $word (keys(%related_words)) { next if (! defined($word)); my $result = &get_related($word); $thesaurus_db{$word}=$wordcount{$word}.":".$result if ($result); } ## ## Store away special values (must contain characters not matched by \w) ## $thesaurus_db{'average.count'}=$avecount; $thesaurus_db{'total.count'}=$totalcount; untie %thesaurus_db; ## ## Perform test lookups ## if ($test) { if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)) { die "Error opening DB file.\n"; } foreach my $word ('torque','rna','polymerase') { my $result = $thesaurus_db{$word}; print "Results for $word = $result\n" if ($result); } untie %thesaurus_db; } ################################################################ ################################################################ # # get_related($keyword) is a utility function which will return a string # of the format: # keyword1,frequency1:keyword2,frequency2:..... # # 'frequency1' is the number of times the keyword1 appears in a keywords # list with $keyword. # sub get_related { my $keyword = shift; return undef if ((! $keyword) ||(! exists($related_words{$keyword}))); my %related_hash = %{$related_words{$keyword}}; my @Related_words = keys(%{$related_words{$keyword}}); @Related_words = sort {$related_hash{$b} <=> $related_hash{$a} } @Related_words; my $result; foreach (@Related_words) { $result .= "$_,$related_hash{$_}:"; } chop $result; return $result; }