[LON-CAPA-cvs] cvs: loncom /localize/localize checkduplicates.pl
bisitz
bisitz@source.lon-capa.org
Wed, 08 Apr 2009 15:10:23 -0000
bisitz Wed Apr 8 15:10:23 2009 EDT
Modified files:
/loncom/localize/localize checkduplicates.pl
Log:
Heavily optimized how duplicate keys are searched for:
- Read the translation file only once and directly count key occurrences
(building the lexicon hash is no longer needed; thanks to Stefan Droeschler for the idea)
- More flexible key matching pattern
(leading whitespace)
- Optimized key matching pattern (quotes)
- Now also print the number of occurrences of each duplicate key
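
For illustration, a condensed, self-contained sketch of the same one-pass counting idea (the file name de.pm and the exact output wording are placeholders for this example; the real pattern and messages are in the diff below):

use strict;
use warnings;

my $filename = 'de.pm';          # placeholder translation file name
my %counter;

open( my $fh, '<', $filename ) or die "$filename cannot be opened\n";
while ( my $line = readline($fh) ) {
    next if $line =~ /^\s*#/;                      # ignore comments
    $counter{$1}++ if $line =~ /^\s+["'](.*)["']/; # count each quoted key
}
close($fh);

# report every key that was seen more than once, together with its count
foreach my $key ( keys %counter ) {
    print "Found $counter{$key} times key: $key\n" if $counter{$key} > 1;
}

Counting directly into the hash while reading avoids the earlier nested loop over the key array and the lexicon hash.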
Index: loncom/localize/localize/checkduplicates.pl
diff -u loncom/localize/localize/checkduplicates.pl:1.1 loncom/localize/localize/checkduplicates.pl:1.2
--- loncom/localize/localize/checkduplicates.pl:1.1 Tue Apr 7 10:51:53 2009
+++ loncom/localize/localize/checkduplicates.pl Wed Apr 8 15:10:22 2009
@@ -1,8 +1,9 @@
#!/usr/bin/perl
# The LearningOnline Network with CAPA
-# $Id: checkduplicates.pl,v 1.1 2009/04/07 10:51:53 bisitz Exp $
+# $Id: checkduplicates.pl,v 1.2 2009/04/08 15:10:22 bisitz Exp $
# 07.04.2009 Stefan Bisitz
+# Optimization ideas by Stefan Droeschler
use strict;
use warnings;
@@ -37,48 +38,31 @@
# Start Analysis
print "checkduplicates is searching for duplicates in $filename...\n";
-
# Manually read all stored keys from translation file (including possible duplicates)
-my @all_keys;
+# and count key occurrences in a separate hash.
+my %counter;
my $line;
open( FH, "<", $filename ) or die "$filename cannot be opened\n";
while ( !eof(FH) ) {
$line = readline(FH);
- next if $line=~/^\s*#/;
+ next if $line=~/^\s*#/; # ignore comments
#$exprNP=~s/^["'](.*)["']$/$1/; # Remove " and ' at beginning and end
- if ($line =~ m/ "(.*)"/) { # Find and save "..." key
- push(@all_keys, $1);
- } elsif ($line =~ m/ '(.*)'/) { # Find and save '...' key
- push(@all_keys, $1);
+ if ($line =~ m/^\s+["'](.*)["']/) { # Find "..." or '...' key
+ $counter{$1}++;
}
}
close(FH);
-
-# Read lexicon hash from translation file into hash
-my %lexicon = &readlexicon($filename);
-
-
-# Synch lexicon hash and Array of keys to find all doublettes
-# Check for each key in the lexicon hash if this key occures more than one time in the hash file
-# If found, print warning and count
-
+# Print all keys which occur more than once
my $dupl = 0; # total counter to count when a key occurred more than one time
-my %found; # Hash to save keys which have already been found
-
-foreach my $lex_key (keys %lexicon) {
- my $counter = 0;
- foreach my $all_key (@all_keys) {
- if ($all_key eq $lex_key) {
- $counter++;
- if ( ($counter > 1) && (!$found{$all_key}) ) {
- $dupl++ if ($counter == 2);
- $found{$all_key} = 1;
- print 'Found duplicate key: '.$lex_key."\n";
- }
- }
+foreach my $count_key (keys %counter) {
+ my $count_value = $counter{$count_key};
+ if ($count_value > 1) {
+ print 'Found '.$count_value.' times key: '.$count_key."\n";
+ $dupl++;
}
}
+
if ($dupl == 0) {
print "Be happy - No duplicates found.\n";
} else {
@@ -86,38 +70,4 @@
}
# ----------------------------------------------------------------
-# Code taken from sync.pl
-# in : $filename
-# out: %lexicon
-
-sub readlexicon {
- # Read translation file into memory
- my $fn=shift;
- open(IN,$fn) or die;
- my %lexicon=();
- my $contents=join('',<IN>);
- close(IN);
- # Tidy up: remove header data
- $contents=~s/package Apache\:[^\;]+//;
- $contents=~s/use base[^\;]+//;
- # Build hash with hash from file
- my %Lexicon=();
- eval($contents.'; %lexicon=%Lexicon;');
- if ($@ ne "") {
- print "\nAn error occurred during the attempt to retrieve the translation hash for the file '$fn'.\n"
- ."Error: ".$@."\n";
- die;
- }
- # Remove entries which are not needed for synch
- delete $lexicon{'_AUTO'};
- delete $lexicon{'char_encoding'};
- delete $lexicon{'language_code'};
- # Hash is expected not to be empty
- if (!scalar(keys(%lexicon))) {
- print "\nWarning: No translation phrases found in '$fn'.\n";
- }
- return %lexicon;
-}
-
-# ----------------------------------------------------------------
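
For context on the new matching pattern: the ^\s+ anchor and the combined ["'] class assume that lexicon keys sit on their own indented line, while the %Lexicon opener, the "=> 'translation'," lines and comments do not start with whitespace followed by a quote. A made-up snippet (sample lines only mimic the lexicon layout) to check that assumption:

use strict;
use warnings;

# made-up lines in the style of a LON-CAPA lexicon file
my @samples = (
    "%Lexicon = (\n",
    "   'Show All'\n",
    "=> 'Alle anzeigen',\n",
    "   \"Save\"\n",
    "# 'old entry, commented out'\n",
);

foreach my $line (@samples) {
    next if $line =~ /^\s*#/;                          # comments are skipped
    print "matched key: $1\n" if $line =~ /^\s+["'](.*)["']/;
}
# prints:
#   matched key: Show All
#   matched key: Save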