File:
[LON-CAPA] /
loncom /
thesaurus /
build_thesaurus_db.pl
Revision
1.1:
download - view:
text,
annotated -
select for diffs
Thu Jul 11 20:48:31 2002 UTC (22 years, 2 months ago) by
matthew
Branches:
MAIN
CVS tags:
version_1_0_3,
version_1_0_2,
version_1_0_1,
version_1_0_0,
version_0_99_5,
version_0_99_4,
version_0_99_3,
version_0_99_2,
version_0_99_1,
version_0_99_0,
version_0_6_2,
version_0_6,
version_0_5_1,
version_0_5,
conference_2003,
HEAD
Script to build LON-CAPA thesaurus database.
1: #!/usr/bin/perl -w
2: #
3: # $Id: build_thesaurus_db.pl,v 1.1 2002/07/11 20:48:31 matthew Exp $
4: #
5: #
6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
7: #
8: # Copyright Michigan State University Board of Trustees
9: #
10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
11: #
12: # LON-CAPA is free software; you can redistribute it and/or modify
13: # it under the terms of the GNU General Public License as published by
14: # the Free Software Foundation; either version 2 of the License, or
15: # (at your option) any later version.
16: #
17: # LON-CAPA is distributed in the hope that it will be useful,
18: # but WITHOUT ANY WARRANTY; without even the implied warranty of
19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20: # GNU General Public License for more details.
21: #
22: # You should have received a copy of the GNU General Public License
23: # along with LON-CAPA; if not, write to the Free Software
24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25: #
26: # /home/httpd/html/adm/gpl.txt
27: #
28: # http://www.lon-capa.org/
29: #
30: use strict;
31: use Getopt::Long;
32: use GDBM_File;
33: # POD required stuff:
34:
35: =pod
36:
37: =head1 NAME
38:
39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
40:
41: =head1 SYNOPSIS
42:
43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
44:
45: =head1 DESCRIPTION
46:
47: build_thesaurus_db.pl reads two input files. The first is a list of words to
48: omit from the thesaurus. The second is the raw keyword data for the thesaurus.
49: From this file a database is built.
50:
51: =head1 DATABASE FORMAT DESCRIPTION
52:
53: The structure of the database entries is described below.
54:
55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
56:
57: Allow me to repeat myself:
58:
59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
60:
61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
62:
63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
64:
65: Got it? While you are reading this, let me encourage you to document
66: any changes to the structure of the database. It is not that hard and
67: you will save much time if you do.
68:
69: That said, you should make sure the description below actually matches
70: the code, just to be safe.
71:
72: This concludes the lecture portion of the comments.
73:
74: =head1 DATABASE FORMAT DESCRIPTION
75:
76: An entry in the database for a given word is shown below:
77:
78: polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
79: | | |
80: | | The number of times dna appeared in a keywords list
81: | | with the word polymerase.
82: | The related keyword
83: The number of times polymerase appeared in a keywords list.
84:
85: Note: the related words list will be in descending order of occurance with
86: the keyword.
87:
88: =head1 COMMAND LINE OPTIONS
89:
90: =over 4
91:
92:
93: =item --badwordfile <filename>
94:
95: filename must contain a list of words not to put in the thesaurus.
96: Each word must appear on its own line.
97: Currently comments are not supported.
98:
99: =item --keywordfile <filename>
100:
101: File containing the raw word data for the thesaurus. Each line must be
102: comma seperated list of related keywords.
103:
104: =item --outputdb <filename>
105:
106: file to write the LON-CAPA thesaurus database to.
107:
108: =item --help
109:
110: Display this help message and exit.
111:
112: =item --test
113:
114: Run a few test lookups after writing the database.
115:
116: =back
117:
118: The following example shows the default values for each parameter
119:
120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
121:
122: =cut
123:
124: ##
125: ## Get command line parameters
126: ##
127: my ($badwordfile,$outputdbfile,$keywordfile,$help,$test);
128: GetOptions( "badwordfile=s" => \$badwordfile, # --badwordfile
129: "outputdb=s" => \$outputdbfile, # --outputdb
130: "keywordfile=s" => \$keywordfile, # --keywordfile
131: "help" => \$help, # --help
132: "test" => \$test); # --test
133:
134: ##
135: ## Help! Help!
136: ##
137: if ($help) {
138: print <<ENDHELP;
139: build_thesaurus_db.pl Build a LON-CAPA thesaurus database.
140:
141: Command line arguements
142: --badwordfile <filename> filename must contain a list of words not to
143: put in the thesaurus. Each word must appear
144: on its own line and currently comments are not
145: supported.
146: --keywordfile <filename> File containing the raw word data for the
147: thesaurus. Each line must be comma seperated
148: list of related keywords.
149: --outputdb <filename> file to write the LON-CAPA thesaurus database
150: to.
151: --help Display this help message and exit.
152: --test Run a few test lookups after writing the
153: database.
154: The following example shows the default values for each parameter
155:
156: build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
157: --outputdb ./thesaurus.db --keywordfile rawkey.txt
158:
159: ENDHELP
160: exit;
161: }
162:
163: ##
164: ## Set up defaults for parameters and check validity
165: ##
166: $badwordfile = $badwordfile || "./un_keyword.tab";
167: $outputdbfile = $outputdbfile || "./thesaurus.db";
168: $keywordfile = $keywordfile || "./rawkey.txt";
169:
170: foreach my $file ($badwordfile,$keywordfile) {
171: die "$file does not exist." if (! -e $file);
172: }
173:
174: ##
175: ## Global hashes.
176: ##
177: my %wordcount = (); # Holds the number of times each word appears in the
178: # input file.
179: my %related_words=(); # Holds the words related to a word. The keys of this
180: # has are words, and the values are pointers to hashes
181: # which hold the words and their frequencies.
182: my %isbad; # Holds an entry for each keyword that is 'bad'
183:
184: ##
185: ## Initialize hash of bad words. 'bad' meaning their appearance in a keyword
186: ## list does not add information. Not 'bad' meaning profane.
187: ##
188: open BAD,$badwordfile || die "Unable to open ".$badwordfile;
189: while (<BAD>) {
190: chomp;
191: $isbad{lc($_)}++;
192: }
193: close BAD;
194:
195: ##
196: ## Read in the data file and construction related words hash. Skip bad words.
197: ##
198: open(IN,$keywordfile) || die "Unable to open ".$keywordfile;
199: while (<IN>) {
200: chomp;
201: my @Words = split(/\W+/,lc($_));
202: foreach my $keyword (@Words) {
203: next if ($isbad{$keyword});
204: $wordcount{$keyword}++;
205: foreach my $otherword (@Words) {
206: next if (($otherword eq $keyword) || ($isbad{$otherword}));
207: $related_words{$keyword}->{$otherword}++;
208: }
209: }
210: }
211: close(IN);
212:
213: ##
214: ## Determine average number of entries
215: ##
216: my $totalcount;
217: foreach (keys(%wordcount)) {
218: $totalcount+=$wordcount{$_};
219: }
220: my $avecount = $totalcount /(scalar keys(%wordcount));
221:
222: ##
223: ## Make sure we can write the database.
224: ##
225: if (-e $outputdbfile) {
226: die "Cannot remove ".$outputdbfile if (!unlink $outputdbfile);
227: }
228: my %thesaurus_db;
229: if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)) {
230: die "Error opening DB file.\n";
231: }
232:
233: ##
234: ## Write the database file
235: ##
236: foreach my $word (keys(%related_words)) {
237: next if (! defined($word));
238: my $result = &get_related($word);
239: $thesaurus_db{$word}=$wordcount{$word}.":".$result if ($result);
240: }
241:
242: ##
243: ## Store away special values (must contain characters not matched by \w)
244: ##
245: $thesaurus_db{'average.count'}=$avecount;
246: $thesaurus_db{'total.count'}=$totalcount;
247: untie %thesaurus_db;
248:
249: ##
250: ## Perform test lookups
251: ##
252: if ($test) {
253: if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)) {
254: die "Error opening DB file.\n";
255: }
256: foreach my $word ('torque','rna','polymerase') {
257: my $result = $thesaurus_db{$word};
258: print "Results for $word = $result\n" if ($result);
259: }
260: untie %thesaurus_db;
261: }
262:
263:
264: ################################################################
265: ################################################################
266: #
267: # get_related($keyword) is a utility function which will return a string
268: # of the format:
269: # keyword1,frequency1:keyword2,frequency2:.....
270: #
271: # 'frequency1' is the number of times the keyword1 appears in a keywords
272: # list with $keyword.
273: #
274: sub get_related {
275: my $keyword = shift;
276: return undef if ((! $keyword) ||(! exists($related_words{$keyword})));
277: my %related_hash = %{$related_words{$keyword}};
278: my @Related_words = keys(%{$related_words{$keyword}});
279: @Related_words = sort {$related_hash{$b} <=> $related_hash{$a} }
280: @Related_words;
281: my $result;
282: foreach (@Related_words) {
283: $result .= "$_,$related_hash{$_}:";
284: }
285: chop $result;
286: return $result;
287: }
288:
289:
290:
291:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>