[LON-CAPA-cvs] cvs: loncom /interface loncommon.pm lonsearchcat.pm
matthew
lon-capa-cvs@mail.lon-capa.org
Fri, 12 Jul 2002 14:36:16 -0000
This is a MIME encoded message
--matthew1026484576
Content-Type: text/plain
matthew Fri Jul 12 10:36:16 2002 EDT
Modified files:
/loncom/interface lonsearchcat.pm loncommon.pm
Log:
loncommon.pm:
Removed old thesaurus code, including old global variables.
Added two global variables:
%Keywords is a hash of words considered 'keywords' by the thesaurus.
$thesaurus_db_file holds the path to the thesaurus database file.
Removed initialization of old thesaurus variables from BEGIN block and added
initialization of new variables.
Added:
&get_related_words($keyword), which will return words related to $keyword.
&initialize_keywords(), which initializes the %Keywords hash on demand.
Replaced:
&keyword() now uses the %Keywords hash, after initializing it.
lonsearchcat.pm:
Added a checkbox on the advanced search to 'use related words', and
code to add the words to the users query. This support is preliminary
and will change.
--matthew1026484576
Content-Type: text/plain
Content-Disposition: attachment; filename="matthew-20020712103616.txt"
Index: loncom/interface/lonsearchcat.pm
diff -u loncom/interface/lonsearchcat.pm:1.140 loncom/interface/lonsearchcat.pm:1.141
--- loncom/interface/lonsearchcat.pm:1.140 Tue Jul 9 13:27:11 2002
+++ loncom/interface/lonsearchcat.pm Fri Jul 12 10:36:16 2002
@@ -1,7 +1,7 @@
# The LearningOnline Network with CAPA
# Search Catalog
#
-# $Id: lonsearchcat.pm,v 1.140 2002/07/09 17:27:11 matthew Exp $
+# $Id: lonsearchcat.pm,v 1.141 2002/07/12 14:36:16 matthew Exp $
#
# Copyright Michigan State University Board of Trustees
#
@@ -318,8 +318,11 @@
' ';
# $scrout.=&simplecheckbox('allversions',$ENV{'form.allversions'});
# $scrout.='<font color="#800000">Search historic archives</font>';
+ my $checkbox = &simplecheckbox('related',$ENV{'form.related'});
$scrout.=<<END;
-</td><td><a href="/adm/searchcat?reqinterface=advanced">Advanced Search</a></td></tr></table>
+</td><td><a href="/adm/searchcat?reqinterface=advanced">Advanced Search</a></td></tr>
+<tr><td>$checkbox use related words</td><td></td></tr>
+</table>
</p>
<p>
<input type="submit" name="basicsubmit" value='SEARCH' />
@@ -927,7 +930,18 @@
&output_blank_field_error($r);
return OK;
}
-
+ if ($ENV{'form.related'}) {
+ my $tmp = $ENV{'form.basicexp'};
+ while ($ENV{'form.basicexp'} =~ /(\w+)/cg) {
+ my $word = $1;
+ my @Words = &Apache::loncommon::get_related_words($word);
+ my $replacement = join " OR ", ($word,
+ ($#Words>4? @Words[0..4] : @Words)
+ );
+ $tmp =~ s/\b$word\b/ $replacement /g;
+ }
+ $ENV{'form.basicexp'} = $tmp;
+ }
# Build SQL query string based on form page
my $query='';
my $concatarg=join('," ",',
Index: loncom/interface/loncommon.pm
diff -u loncom/interface/loncommon.pm:1.45 loncom/interface/loncommon.pm:1.46
--- loncom/interface/loncommon.pm:1.45 Tue Jul 9 13:15:58 2002
+++ loncom/interface/loncommon.pm Fri Jul 12 10:36:16 2002
@@ -1,7 +1,7 @@
# The LearningOnline Network with CAPA
# a pile of common routines
#
-# $Id: loncommon.pm,v 1.45 2002/07/09 17:15:58 matthew Exp $
+# $Id: loncommon.pm,v 1.46 2002/07/12 14:36:16 matthew Exp $
#
# Copyright Michigan State University Board of Trustees
#
@@ -79,26 +79,48 @@
use strict;
use Apache::lonnet();
+use GDBM_File;
use POSIX qw(strftime);
use Apache::Constants qw(:common);
use Apache::lonmsg();
my $readit;
+=pod
+
+=item Global Variables
+
+=over 4
+
+=cut
# ----------------------------------------------- Filetypes/Languages/Copyright
my %language;
my %cprtag;
my %fe; my %fd;
my %category_extensions;
-# -------------------------------------------------------------- Thesaurus data
-my @therelated;
-my @theword;
-my @thecount;
-my %theindex;
-my $thetotalcount;
-my $thefuzzy=2;
-my $thethreshold=0.1/$thefuzzy;
-my $theavecount;
+# ---------------------------------------------- Thesaurus variables
+
+=pod
+
+=item %Keywords
+
+A hash used by &keyword to determine if a word is considered a keyword.
+
+=item $thesaurus_db_file
+
+Scalar containing the full path to the thesaurus database.
+
+=cut
+
+my %Keywords;
+my $thesaurus_db_file;
+
+
+=pod
+
+=back
+
+=cut
# ----------------------------------------------------------------------- BEGIN
@@ -114,7 +136,9 @@
# ----------------------------------------------------------------------- BEGIN
BEGIN {
-
+ # Variable initialization
+ $thesaurus_db_file = $Apache::lonnet::perlvar{'lonTabDir'}."/thesaurus.db";
+ #
unless ($readit) {
# ------------------------------------------------------------------- languages
{
@@ -171,26 +195,10 @@
}
}
}
-# -------------------------------------------------------------- Thesaurus data
- {
- my $fh=Apache::File->new($Apache::lonnet::perlvar{'lonTabDir'}.
- '/thesaurus.dat');
- if ($fh) {
- while (<$fh>) {
- my ($tword,$tindex,$tcount,$trelated)=split(/\@/,$_);
- $theindex{$tword}=$tindex;
- $theword[$tindex]=$tword;
- $thecount[$tindex]=$tcount;
- $thetotalcount+=$tcount;
- $therelated[$tindex]=$trelated;
- }
- }
- $theavecount=$thetotalcount/$#thecount;
- }
&Apache::lonnet::logthis(
- "<font color=yellow>INFO: Read file types and thesaurus</font>");
+ "<font color=yellow>INFO: Read file types</font>");
$readit=1;
-}
+ } # end of unless($readit)
}
# ============================================================= END BEGIN BLOCK
@@ -490,8 +498,6 @@
}
###############################################################
-
-###############################################################
## Home server <option> list generating code ##
###############################################################
#-------------------------------------------
@@ -805,65 +811,165 @@
## End Authentication changing form generation functions ##
###############################################################
+###############################################################
+## Thesaurus Functions ##
+###############################################################
+=pod
-# ---------------------------------------------------------- Is this a keyword?
+=item initialize_keywords
-sub keyword {
- my $newword=shift;
- $newword=~s/\W//g;
- $newword=~tr/A-Z/a-z/;
- my $tindex=$theindex{$newword};
- if ($tindex) {
- if ($thecount[$tindex]>$theavecount) {
- return 1;
- }
+Initializes the package variable %Keywords if it is empty. Uses the
+package variable $thesaurus_db_file.
+
+=cut
+
+###################################################
+
+sub initialize_keywords {
+ return 1 if (scalar keys(%Keywords));
+ # If we are here, %Keywords is empty, so fill it up
+ # Make sure the file we need exists...
+ if (! -e $thesaurus_db_file) {
+ &Apache::lonnet::logthis("Attempt to access $thesaurus_db_file".
+ " failed because it does not exist");
+ return 0;
+ }
+ # Set up the hash as a database
+ my %thesaurus_db;
+ if (! tie(%thesaurus_db,'GDBM_File',
+ $thesaurus_db_file,&GDBM_READER,0640)){
+ &Apache::lonnet::logthis("Could not tie \%thesaurus_db to ".
+ $thesaurus_db_file);
+ return 0;
+ }
+ # Get the average number of appearances of a word.
+ my $avecount = $thesaurus_db{'average.count'};
+ # Put keywords (those that appear > average) into %Keywords
+ while (my ($word,$data)=each (%thesaurus_db)) {
+ my ($count,undef) = split /:/,$data;
+ $Keywords{$word}++ if ($count > $avecount);
+ }
+ untie %thesaurus_db;
+ # Remove special values from %Keywords.
+ foreach ('total.count','average.count') {
+ delete($Keywords{$_}) if (exists($Keywords{$_}));
}
- return 0;
+ return 1;
+}
+
+###################################################
+
+=pod
+
+=item keyword($word)
+
+Returns true if $word is a keyword. A keyword is a word that appears more
+than the average number of times in the thesaurus database. Calls
+&initialize_keywords
+
+=cut
+
+###################################################
+
+sub keyword {
+ return if (!&initialize_keywords());
+ my $word=lc(shift());
+ $word=~s/\W//g;
+ return exists($Keywords{$word});
}
+
+###################################################
+# Old code, to be removed soon #
+###################################################
# -------------------------------------------------------- Return related words
+#sub related {
+# my $newword=shift;
+# $newword=~s/\W//g;
+# $newword=~tr/A-Z/a-z/;
+# my $tindex=$theindex{$newword};
+# if ($tindex) {
+# my %found=();
+# foreach (split(/\,/,$therelated[$tindex])) {
+## - Related word found
+# my ($ridx,$rcount)=split(/\:/,$_);
+## - Direct relation index
+# my $directrel=$rcount/$thecount[$tindex];
+# if ($directrel>$thethreshold) {
+# foreach (split(/\,/,$therelated[$ridx])) {
+# my ($rridx,$rrcount)=split(/\:/,$_);
+# if ($rridx==$tindex) {
+## - Determine reverse relation index
+# my $revrel=$rrcount/$thecount[$ridx];
+## - Calculate full index
+# $found{$ridx}=$directrel*$revrel;
+# if ($found{$ridx}>$thethreshold) {
+# foreach (split(/\,/,$therelated[$ridx])) {
+# my ($rrridx,$rrrcount)=split(/\:/,$_);
+# unless ($found{$rrridx}) {
+# my $revrevrel=$rrrcount/$thecount[$ridx];
+# if (
+# $directrel*$revrel*$revrevrel>$thethreshold
+# ) {
+# $found{$rrridx}=
+# $directrel*$revrel*$revrevrel;
+# }
+# }
+# }
+# }
+# }
+# }
+# }
+# }
+# }
+# return ();
+#}
-sub related {
- my $newword=shift;
- $newword=~s/\W//g;
- $newword=~tr/A-Z/a-z/;
- my $tindex=$theindex{$newword};
- if ($tindex) {
- my %found=();
- foreach (split(/\,/,$therelated[$tindex])) {
-# - Related word found
- my ($ridx,$rcount)=split(/\:/,$_);
-# - Direct relation index
- my $directrel=$rcount/$thecount[$tindex];
- if ($directrel>$thethreshold) {
- foreach (split(/\,/,$therelated[$ridx])) {
- my ($rridx,$rrcount)=split(/\:/,$_);
- if ($rridx==$tindex) {
-# - Determine reverse relation index
- my $revrel=$rrcount/$thecount[$ridx];
-# - Calculate full index
- $found{$ridx}=$directrel*$revrel;
- if ($found{$ridx}>$thethreshold) {
- foreach (split(/\,/,$therelated[$ridx])) {
- my ($rrridx,$rrrcount)=split(/\:/,$_);
- unless ($found{$rrridx}) {
- my $revrevrel=$rrrcount/$thecount[$ridx];
- if (
- $directrel*$revrel*$revrevrel>$thethreshold
- ) {
- $found{$rrridx}=
- $directrel*$revrel*$revrevrel;
- }
- }
- }
- }
- }
- }
- }
+###############################################################
+
+=pod
+
+=item get_related_words
+
+Look up a word in the thesaurus. Takes a scalar arguement and returns
+an array of words. If the keyword is not in the thesaurus, an empty array
+will be returned. The order of the words returned is determined by the
+database which holds them.
+
+Uses global $thesaurus_db_file.
+
+=cut
+
+###############################################################
+
+sub get_related_words {
+ my $keyword = shift;
+ my %thesaurus_db;
+ if (! -e $thesaurus_db_file) {
+ &Apache::lonnet::logthis("Attempt to access $thesaurus_db_file ".
+ "failed because the file does not exist");
+ return ();
+ }
+ if (! tie(%thesaurus_db,'GDBM_File',
+ $thesaurus_db_file,&GDBM_READER,0640)){
+ return ();
+ }
+ my @Words=();
+ if (exists($thesaurus_db{$keyword})) {
+ $_ = $thesaurus_db{$keyword};
+ (undef,@Words) = split/:/; # The first element is the number of times
+ # the word appears. We do not need it now.
+ for (my $i=0;$i<=$#Words;$i++) {
+ ($Words[$i],undef)= split/\,/,$Words[$i];
}
}
- return ();
+ untie %thesaurus_db;
+ return @Words;
}
+
+###############################################################
+## End Thesaurus Functions ##
+###############################################################
# ---------------------------------------------------------------- Language IDs
sub languageids {
--matthew1026484576--