[LON-CAPA-cvs] cvs: loncom /thesaurus build_thesaurus_db.pl

Thu, 11 Jul 2002 20:48:31 -0000

This is a MIME encoded message

--matthew1026420511
Content-Type: text/plain

matthew		Thu Jul 11 16:48:31 2002 EDT

  Added files:                 
    /loncom/thesaurus	build_thesaurus_db.pl 
  Log:
  Script to build LON-CAPA thesaurus database.

--matthew1026420511
Content-Type: text/plain
Content-Disposition: attachment; filename="matthew-20020711164831.txt"

Index: loncom/thesaurus/build_thesaurus_db.pl
+++ loncom/thesaurus/build_thesaurus_db.pl
#!/usr/bin/perl -w
#
# $Id: build_thesaurus_db.pl,v 1.1 2002/07/11 20:48:31 matthew Exp $
#
#
# build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
use strict;
use Getopt::Long;
use GDBM_File;
# POD required stuff:

=pod

=head1 NAME

build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.

=head1 SYNOPSIS

build_thesaurus_db.pl creates the LON-CAPA thesaurus database.

=head1 DESCRIPTION

build_thesaurus_db.pl reads two input files.  The first is a list of words to
omit from the thesaurus.  The second is the raw keyword data for the thesaurus.
>From this file a database is built.

=head1 DATABASE FORMAT DESCRIPTION

The structure of the database entries is described below.  

=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!

Allow me to repeat myself:

=head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!

Got it?  While you are reading this, let me encourage you to document
any changes to the structure of the database.  It is not that hard and
you will save much time if you do.  

That said, you should make sure the description below actually matches
the code, just to be safe.

This concludes the lecture portion of the comments.

=head1 DATABASE FORMAT DESCRIPTION

An entry in the database for a given word is shown below:

 polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
              |   |  |
              |   |  The number of times dna appeared in a keywords list
              |   |  with the word polymerase. 
              |   The related keyword
              The number of times polymerase appeared in a keywords list.

Note: the related words list will be in descending order of occurance with 
the keyword.

=head1 COMMAND LINE OPTIONS

=over 4

=item --badwordfile <filename>

filename must contain a list of words not to put in the thesaurus.  
Each word must appear on its own line.
Currently comments are not supported.

=item --keywordfile <filename>

File containing the raw word data for the thesaurus.  Each line must be 
comma seperated list of related keywords.

=item --outputdb <filename>

file to write the LON-CAPA thesaurus database to.

=item --help

Display this help message and exit.

=item --test

Run a few test lookups after writing the database.

=back

The following example shows the default values for each parameter

build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt

=cut

##
## Get command line parameters
##
my ($badwordfile,$outputdbfile,$keywordfile,$help,$test);
GetOptions( "badwordfile=s" => \$badwordfile,   # --badwordfile
            "outputdb=s"    => \$outputdbfile,  # --outputdb
            "keywordfile=s" => \$keywordfile,   # --keywordfile
            "help"          => \$help,          # --help
            "test"          => \$test);         # --test

##
## Help! Help!
##
if ($help) {
    print <<ENDHELP;
build_thesaurus_db.pl     Build a LON-CAPA thesaurus database.

Command line arguements
   --badwordfile <filename>     filename must contain a list of words not to
                                put in the thesaurus.  Each word must appear
                                on its own line and currently comments are not
                                supported.
   --keywordfile <filename>     File containing the raw word data for the
                                thesaurus.  Each line must be comma seperated
                                list of related keywords.
   --outputdb <filename>        file to write the LON-CAPA thesaurus database
                                to.
   --help                       Display this help message and exit.
   --test                       Run a few test lookups after writing the 
                                database.
The following example shows the default values for each parameter

build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
     --outputdb ./thesaurus.db --keywordfile rawkey.txt

ENDHELP
    exit;
}

##
## Set up defaults for parameters and check validity
##
$badwordfile  = $badwordfile  || "./un_keyword.tab";
$outputdbfile = $outputdbfile || "./thesaurus.db";
$keywordfile  = $keywordfile  || "./rawkey.txt";

foreach my $file ($badwordfile,$keywordfile) {
    die "$file does not exist." if (! -e $file);
}

##
## Global hashes.
##
my %wordcount = ();    # Holds the number of times each word appears in the
                       # input file.
my %related_words=();  # Holds the words related to a word.  The keys of this
                       # has are words, and the values are pointers to hashes
                       # which hold the words and their frequencies.
my %isbad;             # Holds an entry for each keyword that is 'bad'

##
## Initialize hash of bad words.  'bad' meaning their appearance in a keyword
## list does not add information.  Not 'bad' meaning profane.  
##
open BAD,$badwordfile || die "Unable to open ".$badwordfile;
while (<BAD>) {
    chomp;
    $isbad{lc($_)}++;
}
close BAD;

##
## Read in the data file and construction related words hash.  Skip bad words.
##
open(IN,$keywordfile) || die "Unable to open ".$keywordfile;
while (<IN>) {
    chomp;
    my @Words = split(/\W+/,lc($_));
    foreach my $keyword (@Words) {
        next if ($isbad{$keyword});
        $wordcount{$keyword}++;
        foreach my $otherword (@Words) {
            next if (($otherword eq $keyword) || ($isbad{$otherword}));
            $related_words{$keyword}->{$otherword}++;
        }
    }
}
close(IN);

##
## Determine average number of entries
##
my $totalcount;
foreach (keys(%wordcount)) {
    $totalcount+=$wordcount{$_};
}
my $avecount = $totalcount /(scalar keys(%wordcount));

##
## Make sure we can write the database.
##
if (-e $outputdbfile) {
    die "Cannot remove ".$outputdbfile if (!unlink $outputdbfile);
}
my %thesaurus_db;
if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)) {
    die "Error opening DB file.\n";
}

##
## Write the database file
##
foreach my $word (keys(%related_words)) {
    next if (! defined($word));
    my $result = &get_related($word);
    $thesaurus_db{$word}=$wordcount{$word}.":".$result if ($result);
}

##
## Store away special values (must contain characters not matched by \w)
##
$thesaurus_db{'average.count'}=$avecount;
$thesaurus_db{'total.count'}=$totalcount;
untie %thesaurus_db;

##
## Perform test lookups
##
if ($test) {
    if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)) {
        die "Error opening DB file.\n";
    }
    foreach my $word ('torque','rna','polymerase') {
        my $result = $thesaurus_db{$word};
        print "Results for $word = $result\n" if ($result);
    }
    untie %thesaurus_db;
}

################################################################
################################################################
#
# get_related($keyword) is a utility function which will return a string
#     of the format: 
#        keyword1,frequency1:keyword2,frequency2:.....
#
#     'frequency1' is the number of times the keyword1 appears in a keywords
#     list with $keyword.
#
sub get_related {
    my $keyword = shift;
    return undef if ((! $keyword) ||(! exists($related_words{$keyword})));
    my %related_hash = %{$related_words{$keyword}};
    my @Related_words = keys(%{$related_words{$keyword}});
    @Related_words = sort {$related_hash{$b} <=> $related_hash{$a} } 
                          @Related_words;
    my $result;
    foreach (@Related_words) {
        $result .= "$_,$related_hash{$_}:";
    }
    chop $result;
    return $result;
}

--matthew1026420511--