[LON-CAPA-cvs] cvs: loncom /thesaurus build_thesaurus_db.pl
matthew
lon-capa-cvs@mail.lon-capa.org
Thu, 11 Jul 2002 20:48:31 -0000
This is a MIME encoded message
--matthew1026420511
Content-Type: text/plain
matthew Thu Jul 11 16:48:31 2002 EDT
Added files:
/loncom/thesaurus build_thesaurus_db.pl
Log:
Script to build LON-CAPA thesaurus database.
--matthew1026420511
Content-Type: text/plain
Content-Disposition: attachment; filename="matthew-20020711164831.txt"
Index: loncom/thesaurus/build_thesaurus_db.pl
+++ loncom/thesaurus/build_thesaurus_db.pl
#!/usr/bin/perl -w
#
# $Id: build_thesaurus_db.pl,v 1.1 2002/07/11 20:48:31 matthew Exp $
#
#
# build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
use strict;
use Getopt::Long;
use GDBM_File;
# POD required stuff:
=pod
=head1 NAME
build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
=head1 SYNOPSIS
build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
=head1 DESCRIPTION
build_thesaurus_db.pl reads two input files. The first is a list of words to
omit from the thesaurus. The second is the raw keyword data for the thesaurus.
From this file a database is built.
=head1 DATABASE FORMAT DESCRIPTION
The structure of the database entries is described below.
=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
Allow me to repeat myself:
=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
Got it? While you are reading this, let me encourage you to document
any changes to the structure of the database. It is not that hard and
you will save much time if you do.
That said, you should make sure the description below actually matches
the code, just to be safe.
This concludes the lecture portion of the comments.
=head1 DATABASE FORMAT DESCRIPTION
An entry in the database for a given word is shown below:
 polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
              |  |   |
              |  |   The number of times dna appeared in a keywords list
              |  |   with the word polymerase.
              |  The related keyword
              The number of times polymerase appeared in a keywords list.
Note: the related words list will be in descending order of occurrence with
the keyword.
=head1 COMMAND LINE OPTIONS
=over 4
=item --badwordfile <filename>
filename must contain a list of words not to put in the thesaurus.
Each word must appear on its own line.
Currently comments are not supported.
=item --keywordfile <filename>
File containing the raw word data for the thesaurus. Each line must be a
comma-separated list of related keywords.
=item --outputdb <filename>
file to write the LON-CAPA thesaurus database to.
=item --help
Display this help message and exit.
=item --test
Run a few test lookups after writing the database.
=back
The following example shows the default values for each parameter
build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
=cut
##
## Get command line parameters
##
## Each variable stays undefined unless its option was given on the command
## line.  GetOptions() returns false (after printing its own warnings) when
## it meets an unknown or malformed option; stop in that case instead of
## silently continuing with a partially parsed command line.
##
my ($badwordfile,$outputdbfile,$keywordfile,$help,$test);
GetOptions( "badwordfile=s" => \$badwordfile,    # --badwordfile <filename>
            "outputdb=s"    => \$outputdbfile,   # --outputdb <filename>
            "keywordfile=s" => \$keywordfile,    # --keywordfile <filename>
            "help"          => \$help,           # --help
            "test"          => \$test)           # --test
    or die "Invalid command line options; run with --help for usage.\n";
##
## Help! Help!
##
## When --help was given, print the usage text and exit without doing any
## other work.  The here-document is single-quoted so that its text is
## printed literally: in an interpolating here-document the trailing
## backslash of the example command line is treated as an escape and
## disappears from the output.
##
if ($help) {
    print <<'ENDHELP';
build_thesaurus_db.pl Build a LON-CAPA thesaurus database.
Command line arguements
--badwordfile <filename> filename must contain a list of words not to
put in the thesaurus. Each word must appear
on its own line and currently comments are not
supported.
--keywordfile <filename> File containing the raw word data for the
thesaurus. Each line must be comma seperated
list of related keywords.
--outputdb <filename> file to write the LON-CAPA thesaurus database
to.
--help Display this help message and exit.
--test Run a few test lookups after writing the
database.
The following example shows the default values for each parameter
build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
--outputdb ./thesaurus.db --keywordfile rawkey.txt
ENDHELP
    exit;
}
##
## Fill in the default for every parameter that was not supplied (or was
## supplied as a false value), then insist that both input files exist.
##
$badwordfile  ||= "./un_keyword.tab";
$outputdbfile ||= "./thesaurus.db";
$keywordfile  ||= "./rawkey.txt";
for my $file ($badwordfile, $keywordfile) {
    unless (-e $file) {
        die "$file does not exist.";
    }
}
##
## Global hashes.
##
# %wordcount:     number of times each word appears in the input file.
# %related_words: the words related to a word.  The keys of this hash are
#                 words, and the values are references to hashes which hold
#                 the related words and their frequencies.
# %isbad:         holds an entry for each keyword that is 'bad'.
my (%wordcount, %related_words, %isbad);
##
## Initialize hash of bad words. 'bad' meaning their appearance in a keyword
## list does not add information. Not 'bad' meaning profane.
##
## Every line of the bad word file is taken, lowercased, as one bad word.
##
# Use 'or' rather than '||': with '||' the die attaches to the filename
# expression, so a failed open would go completely unnoticed.  The explicit
# '<' mode keeps the filename from being interpreted as anything but a file
# to read.
open(my $bad_fh, '<', $badwordfile)
    or die "Unable to open $badwordfile: $!";
while (my $badword = <$bad_fh>) {
    chomp $badword;
    $isbad{lc($badword)}++;
}
close($bad_fh);
##
## Read in the data file and construct the related words hash. Skip bad words.
##
## Each input line is one keyword list.  For every good word on the line,
## %wordcount is incremented and every other good word on the same line is
## recorded (with a running count) in %related_words.
##
open(my $keyword_fh, '<', $keywordfile)
    or die "Unable to open $keywordfile: $!";
while (my $line = <$keyword_fh>) {
    chomp $line;
    # Splitting on runs of non-word characters produces a leading empty
    # field when the line starts with a separator (whitespace, a quote, ...).
    # Drop empty fields so that '' is never counted as a keyword or listed
    # as a related word, and drop the bad words in the same pass.
    my @words = grep { length($_) && !$isbad{$_} } split(/\W+/, lc($line));
    foreach my $keyword (@words) {
        $wordcount{$keyword}++;
        foreach my $otherword (@words) {
            next if ($otherword eq $keyword);
            $related_words{$keyword}->{$otherword}++;
        }
    }
}
close($keyword_fh);
##
## Determine average number of entries
##
## $totalcount is the sum of all per-word counts; $avecount is that sum
## divided by the number of distinct words.
##
my $totalcount = 0;
$totalcount += $_ foreach (values(%wordcount));
my $distinct_words = scalar(keys(%wordcount));
# Without any usable keyword the average is undefined; stop with a clear
# message rather than an "Illegal division by zero" error.
die "No usable keywords found in $keywordfile\n" if (! $distinct_words);
my $avecount = $totalcount / $distinct_words;
##
## Make sure we can write the database: remove any existing output file,
## then tie %thesaurus_db to a GDBM file at that path, opened with
## GDBM_WRCREAT and mode 0640.
##
if (-e $outputdbfile) {
    unlink($outputdbfile) or die "Cannot remove ".$outputdbfile;
}
my %thesaurus_db;
tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)
    or die "Error opening DB file.\n";
##
## Write the database file: one entry per word that has related words, in
## the form "<count of word>:<related words string from get_related>".
##
for my $word (keys(%related_words)) {
    my $related = get_related($word);
    next unless ($related);
    $thesaurus_db{$word} = join(":", $wordcount{$word}, $related);
}
##
## Store away special values (must contain characters not matched by \w).
## Keywords are produced by splitting on \W+, so a key containing '.' cannot
## collide with a keyword.
##
$thesaurus_db{'average.count'} = $avecount;
$thesaurus_db{'total.count'}   = $totalcount;
untie %thesaurus_db;
##
## Perform test lookups (only with --test): reopen the freshly written
## database read-only and print the entries of a few sample words, skipping
## any word that has no entry.
##
if ($test) {
    tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)
        or die "Error opening DB file.\n";
    for my $word (qw(torque rna polymerase)) {
        my $result = $thesaurus_db{$word};
        next unless ($result);
        print "Results for $word = $result\n";
    }
    untie %thesaurus_db;
}
################################################################
################################################################
#
# get_related($keyword) is a utility function which will return a string
# of the format:
#     keyword1,frequency1:keyword2,frequency2:.....
#
# 'frequency1' is the number of times the keyword1 appears in a keywords
# list with $keyword.
#
# The entries are ordered by descending frequency; entries with the same
# frequency are ordered alphabetically so that the output is reproducible
# from run to run.
#
# Returns undef when $keyword is undefined or empty, when it has no entry
# in %related_words, or when its entry holds no related words.
#
sub get_related {
    my $keyword = shift;
    # Test for defined/non-empty rather than truth, so that the word "0"
    # is accepted as a keyword.
    return undef if ((! defined($keyword)) || ($keyword eq ''));
    return undef if (! exists($related_words{$keyword}));
    # Work through the reference; there is no need to copy the whole hash.
    my $frequency_of = $related_words{$keyword};
    my @ordered = sort { $frequency_of->{$b} <=> $frequency_of->{$a}
                             or $a cmp $b } keys(%{$frequency_of});
    return undef if (! @ordered);
    return join(':', map { "$_,$frequency_of->{$_}" } @ordered);
}
--matthew1026420511--