[LON-CAPA-cvs] cvs: loncom /interface loncommon.pm lonsearchcat.pm

Fri, 12 Jul 2002 14:36:16 -0000

This is a MIME encoded message

--matthew1026484576
Content-Type: text/plain

matthew		Fri Jul 12 10:36:16 2002 EDT

  Modified files:              
    /loncom/interface	lonsearchcat.pm loncommon.pm 
  Log:
  loncommon.pm:
    Removed old thesaurus code, including old global variables.
    Added two global variables:
      %Keywords is a hash of words considered 'keywords' by the thesaurus.
      $thesaurus_db_file holds the path to the thesaurus database file.
    Removed initialization of old thesaurus variables from BEGIN block and added 
      initialization of new variables.
    Added:
      &get_related_words($keyword), which will return words related to $keyword.
      &initialize_keywords(), which initializes the %Keywords hash on demand.
    Replaced:
      &keyword() now uses the %Keywords hash, after initializing it.
  lonsearchcat.pm:
    Added a checkbox on the advanced search to 'use related words', and 
    code to add the words to the users query.  This support is preliminary
    and will change.
  
  
--matthew1026484576
Content-Type: text/plain
Content-Disposition: attachment; filename="matthew-20020712103616.txt"

Index: loncom/interface/lonsearchcat.pm
diff -u loncom/interface/lonsearchcat.pm:1.140 loncom/interface/lonsearchcat.pm:1.141

--- loncom/interface/lonsearchcat.pm:1.140	Tue Jul  9 13:27:11 2002
+++ loncom/interface/lonsearchcat.pm	Fri Jul 12 10:36:16 2002
@@ -1,7 +1,7 @@
 # The LearningOnline Network with CAPA
 # Search Catalog
 #
-# $Id: lonsearchcat.pm,v 1.140 2002/07/09 17:27:11 matthew Exp $
+# $Id: lonsearchcat.pm,v 1.141 2002/07/12 14:36:16 matthew Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -318,8 +318,11 @@
         '&nbsp;';
 #    $scrout.=&simplecheckbox('allversions',$ENV{'form.allversions'});
 #    $scrout.='<font color="#800000">Search historic archives</font>';
+    my $checkbox = &simplecheckbox('related',$ENV{'form.related'});
     $scrout.=<<END;
-</td><td><a href="/adm/searchcat?reqinterface=advanced">Advanced Search</a></td></tr></table>
+</td><td><a href="/adm/searchcat?reqinterface=advanced">Advanced Search</a></td></tr>
+<tr><td>$checkbox use related words</td><td></td></tr>
+</table>
 </p>
 <p>
 &nbsp;<input type="submit" name="basicsubmit" value='SEARCH' />&nbsp;
@@ -927,7 +930,18 @@
 	&output_blank_field_error($r);
 	return OK;
     }
-
+    if ($ENV{'form.related'}) {
+        my $tmp = $ENV{'form.basicexp'};
+        while ($ENV{'form.basicexp'} =~ /(\w+)/cg) {
+            my $word = $1;
+            my @Words = &Apache::loncommon::get_related_words($word);
+            my $replacement = join " OR ", ($word,
+                                            ($#Words>4? @Words[0..4] : @Words)
+                                            );
+            $tmp =~ s/\b$word\b/ $replacement /g;
+        }
+        $ENV{'form.basicexp'} = $tmp;
+    }
     # Build SQL query string based on form page
     my $query='';
     my $concatarg=join(',"    ",',
Index: loncom/interface/loncommon.pm
diff -u loncom/interface/loncommon.pm:1.45 loncom/interface/loncommon.pm:1.46
--- loncom/interface/loncommon.pm:1.45	Tue Jul  9 13:15:58 2002
+++ loncom/interface/loncommon.pm	Fri Jul 12 10:36:16 2002
@@ -1,7 +1,7 @@
 # The LearningOnline Network with CAPA
 # a pile of common routines
 #
-# $Id: loncommon.pm,v 1.45 2002/07/09 17:15:58 matthew Exp $
+# $Id: loncommon.pm,v 1.46 2002/07/12 14:36:16 matthew Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -79,26 +79,48 @@
 
 use strict;
 use Apache::lonnet();
+use GDBM_File;
 use POSIX qw(strftime);
 use Apache::Constants qw(:common);
 use Apache::lonmsg();
 my $readit;
 
+=pod 
+
+=item Global Variables
+
+=over 4
+
+=cut
 # ----------------------------------------------- Filetypes/Languages/Copyright
 my %language;
 my %cprtag;
 my %fe; my %fd;
 my %category_extensions;
 
-# -------------------------------------------------------------- Thesaurus data
-my @therelated;
-my @theword;
-my @thecount;
-my %theindex;
-my $thetotalcount;
-my $thefuzzy=2;
-my $thethreshold=0.1/$thefuzzy;
-my $theavecount;
+# ---------------------------------------------- Thesaurus variables
+
+=pod
+
+=item %Keywords  
+
+A hash used by &keyword to determine if a word is considered a keyword.
+
+=item $thesaurus_db_file
+
+Scalar containing the full path to the thesaurus database.                 
+
+=cut
+
+my %Keywords;
+my $thesaurus_db_file;
+
+
+=pod
+
+=back
+
+=cut
 
 # ----------------------------------------------------------------------- BEGIN
 
@@ -114,7 +136,9 @@
 # ----------------------------------------------------------------------- BEGIN
 
 BEGIN {
-
+    # Variable initialization
+    $thesaurus_db_file = $Apache::lonnet::perlvar{'lonTabDir'}."/thesaurus.db";
+    #
     unless ($readit) {
 # ------------------------------------------------------------------- languages
     {
@@ -171,26 +195,10 @@
 	    }
 	}
     }
-# -------------------------------------------------------------- Thesaurus data
-    {
-	my $fh=Apache::File->new($Apache::lonnet::perlvar{'lonTabDir'}.
-	       '/thesaurus.dat');
-	if ($fh) {
-            while (<$fh>) {
-               my ($tword,$tindex,$tcount,$trelated)=split(/\@/,$_);
-               $theindex{$tword}=$tindex;
-               $theword[$tindex]=$tword;
-               $thecount[$tindex]=$tcount;
-               $thetotalcount+=$tcount;
-               $therelated[$tindex]=$trelated;
-	   }
-        }
-        $theavecount=$thetotalcount/$#thecount;
-    }
     &Apache::lonnet::logthis(
-              "<font color=yellow>INFO: Read file types and thesaurus</font>");
+              "<font color=yellow>INFO: Read file types</font>");
     $readit=1;
-}
+    }  # end of unless($readit) 
     
 }
 # ============================================================= END BEGIN BLOCK
@@ -490,8 +498,6 @@
 }
 
 ###############################################################
-
-###############################################################
 ##        Home server <option> list generating code          ##
 ###############################################################
 #-------------------------------------------
@@ -805,65 +811,165 @@
 ##   End Authentication changing form generation functions   ##
 ###############################################################
 
+###############################################################
+##                Thesaurus Functions                        ##
+###############################################################
 
+=pod
 
-# ---------------------------------------------------------- Is this a keyword?
+=item initialize_keywords
 
-sub keyword {
-    my $newword=shift;
-    $newword=~s/\W//g;
-    $newword=~tr/A-Z/a-z/;
-    my $tindex=$theindex{$newword};
-    if ($tindex) {
-        if ($thecount[$tindex]>$theavecount) {
-           return 1;
-        }
+Initializes the package variable %Keywords if it is empty.  Uses the
+package variable $thesaurus_db_file.
+
+=cut
+
+###################################################
+
+sub initialize_keywords {
+    return 1 if (scalar keys(%Keywords));
+    # If we are here, %Keywords is empty, so fill it up
+    #   Make sure the file we need exists...
+    if (! -e $thesaurus_db_file) {
+        &Apache::lonnet::logthis("Attempt to access $thesaurus_db_file".
+                                 " failed because it does not exist");
+        return 0;
+    }
+    #   Set up the hash as a database
+    my %thesaurus_db;
+    if (! tie(%thesaurus_db,'GDBM_File',
+              $thesaurus_db_file,&GDBM_READER,0640)){
+        &Apache::lonnet::logthis("Could not tie \%thesaurus_db to ".
+                                 $thesaurus_db_file);
+        return 0;
+    } 
+    #  Get the average number of appearances of a word.
+    my $avecount = $thesaurus_db{'average.count'};
+    #  Put keywords (those that appear > average) into %Keywords
+    while (my ($word,$data)=each (%thesaurus_db)) {
+        my ($count,undef) = split /:/,$data;
+        $Keywords{$word}++ if ($count > $avecount);
+    }
+    untie %thesaurus_db;
+    # Remove special values from %Keywords.
+    foreach ('total.count','average.count') {
+        delete($Keywords{$_}) if (exists($Keywords{$_}));
     }
-    return 0;
+    return 1;
+}
+
+###################################################
+
+=pod
+
+=item keyword($word)
+
+Returns true if $word is a keyword.  A keyword is a word that appears more 
+than the average number of times in the thesaurus database.  Calls 
+&initialize_keywords
+
+=cut
+
+###################################################
+
+sub keyword {
+    return if (!&initialize_keywords());
+    my $word=lc(shift());
+    $word=~s/\W//g;
+    return exists($Keywords{$word});
 }
+
+###################################################
+#         Old code, to be removed soon            #
+###################################################
 # -------------------------------------------------------- Return related words
+#sub related {
+#    my $newword=shift;
+#    $newword=~s/\W//g;
+#    $newword=~tr/A-Z/a-z/;
+#    my $tindex=$theindex{$newword};
+#    if ($tindex) {
+#        my %found=();
+#        foreach (split(/\,/,$therelated[$tindex])) {
+## - Related word found
+#            my ($ridx,$rcount)=split(/\:/,$_);
+## - Direct relation index
+#            my $directrel=$rcount/$thecount[$tindex];
+#            if ($directrel>$thethreshold) {
+#               foreach (split(/\,/,$therelated[$ridx])) {
+#                  my ($rridx,$rrcount)=split(/\:/,$_);
+#                  if ($rridx==$tindex) {
+## - Determine reverse relation index
+#                     my $revrel=$rrcount/$thecount[$ridx];
+## - Calculate full index
+#                     $found{$ridx}=$directrel*$revrel;
+#                     if ($found{$ridx}>$thethreshold) {
+#                        foreach (split(/\,/,$therelated[$ridx])) {
+#                            my ($rrridx,$rrrcount)=split(/\:/,$_);
+#                            unless ($found{$rrridx}) {
+#                               my $revrevrel=$rrrcount/$thecount[$ridx];
+#                               if (
+#                          $directrel*$revrel*$revrevrel>$thethreshold
+#                               ) {
+#                                  $found{$rrridx}=
+#                                       $directrel*$revrel*$revrevrel;
+#                               }
+#                            }
+#                        }
+#                     }
+#                  }
+#               }
+#            }
+#        }
+#    }
+#    return ();
+#}
 
-sub related {
-    my $newword=shift;
-    $newword=~s/\W//g;
-    $newword=~tr/A-Z/a-z/;
-    my $tindex=$theindex{$newword};
-    if ($tindex) {
-        my %found=();
-        foreach (split(/\,/,$therelated[$tindex])) {
-# - Related word found
-            my ($ridx,$rcount)=split(/\:/,$_);
-# - Direct relation index
-            my $directrel=$rcount/$thecount[$tindex];
-            if ($directrel>$thethreshold) {
-               foreach (split(/\,/,$therelated[$ridx])) {
-                  my ($rridx,$rrcount)=split(/\:/,$_);
-                  if ($rridx==$tindex) {
-# - Determine reverse relation index
-                     my $revrel=$rrcount/$thecount[$ridx];
-# - Calculate full index
-                     $found{$ridx}=$directrel*$revrel;
-                     if ($found{$ridx}>$thethreshold) {
-                        foreach (split(/\,/,$therelated[$ridx])) {
-                            my ($rrridx,$rrrcount)=split(/\:/,$_);
-                            unless ($found{$rrridx}) {
-                               my $revrevrel=$rrrcount/$thecount[$ridx];
-                               if (
-                          $directrel*$revrel*$revrevrel>$thethreshold
-                               ) {
-                                  $found{$rrridx}=
-                                       $directrel*$revrel*$revrevrel;
-                               }
-                            }
-                        }
-                     }
-                  }
-               }
-            }
+###############################################################
+
+=pod 
+
+=item get_related_words
+
+Look up a word in the thesaurus.  Takes a scalar arguement and returns
+an array of words.  If the keyword is not in the thesaurus, an empty array
+will be returned.  The order of the words returned is determined by the
+database which holds them.
+
+Uses global $thesaurus_db_file.
+
+=cut
+
+###############################################################
+
+sub get_related_words {
+    my $keyword = shift;
+    my %thesaurus_db;
+    if (! -e $thesaurus_db_file) {
+        &Apache::lonnet::logthis("Attempt to access $thesaurus_db_file ".
+                                 "failed because the file does not exist");
+        return ();
+    }
+    if (! tie(%thesaurus_db,'GDBM_File',
+              $thesaurus_db_file,&GDBM_READER,0640)){
+        return ();
+    } 
+    my @Words=();
+    if (exists($thesaurus_db{$keyword})) {
+        $_ = $thesaurus_db{$keyword};
+        (undef,@Words) = split/:/;  # The first element is the number of times
+                                    # the word appears.  We do not need it now.
+        for (my $i=0;$i<=$#Words;$i++) {
+            ($Words[$i],undef)= split/\,/,$Words[$i];
         }
     }
-    return ();
+    untie %thesaurus_db;
+    return @Words;
 }
+
+###############################################################
+##              End Thesaurus Functions                      ##
+###############################################################
 
 # ---------------------------------------------------------------- Language IDs
 sub languageids {

--matthew1026484576--