[LON-CAPA-cvs] cvs: loncom /localize/localize checkduplicates.pl
bisitz
bisitz@source.lon-capa.org
Wed, 08 Apr 2009 15:10:23 -0000
bisitz Wed Apr 8 15:10:23 2009 EDT
Modified files:
/loncom/localize/localize checkduplicates.pl
Log:
Heavily optimized how duplicate keys are searched for:
- Read the translation file only once and directly count key occurrences
(building the lexicon hash is no longer needed; thanks to Stefan Droeschler for the idea)
- More flexible key matching pattern
(leading whitespace)
- Optimized key matching pattern (quotes)
- Now also print the number of occurrences of each duplicate key
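
For illustration, a condensed, self-contained sketch of the same one-pass counting idea (the file name de.pm and the exact output wording are placeholders for this example; the real pattern and messages are in the diff below):

use strict;
use warnings;

my $filename = 'de.pm';          # placeholder translation file name
my %counter;

open( my $fh, '<', $filename ) or die "$filename cannot be opened\n";
while ( my $line = readline($fh) ) {
    next if $line =~ /^\s*#/;                      # ignore comments
    $counter{$1}++ if $line =~ /^\s+["'](.*)["']/; # count each quoted key
}
close($fh);

# report every key that was seen more than once, together with its count
foreach my $key ( keys %counter ) {
    print "Found $counter{$key} times key: $key\n" if $counter{$key} > 1;
}

Counting directly into the hash while reading avoids the earlier nested loop over the key array and the lexicon hash.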
Index: loncom/localize/localize/checkduplicates.pl
diff -u loncom/localize/localize/checkduplicates.pl:1.1 loncom/localize/localize/checkduplicates.pl:1.2
--- loncom/localize/localize/checkduplicates.pl:1.1 Tue Apr 7 10:51:53 2009
+++ loncom/localize/localize/checkduplicates.pl Wed Apr 8 15:10:22 2009
@@ -1,8 +1,9 @@
#!/usr/bin/perl
# The LearningOnline Network with CAPA
-# $Id: checkduplicates.pl,v 1.1 2009/04/07 10:51:53 bisitz Exp $
+# $Id: checkduplicates.pl,v 1.2 2009/04/08 15:10:22 bisitz Exp $
# 07.04.2009 Stefan Bisitz
+# Optimization ideas by Stefan Droeschler
use strict;
use warnings;
@@ -37,48 +38,31 @@
# Start Analysis
print "checkduplicates is searching for duplicates in $filename...\n";
-
# Manually read all stored keys from translation file (including possible duplicates)
-my @all_keys;
+# and count key occurrences in a separate hash.
+my %counter;
my $line;
open( FH, "<", $filename ) or die "$filename cannot be opened\n";
while ( !eof(FH) ) {
$line = readline(FH);
- next if $line=~/^\s*#/;
+ next if $line=~/^\s*#/; # ignore comments
#$exprNP=~s/^["'](.*)["']$/$1/; # Remove " and ' at beginning and end
- if ($line =~ m/ "(.*)"/) { # Find and save "..." key
- push(@all_keys, $1);
- } elsif ($line =~ m/ '(.*)'/) { # Find and save '...' key
- push(@all_keys, $1);
+ if ($line =~ m/^\s+["'](.*)["']/) { # Find "..." or '...' key
+ $counter{$1}++;
}
}
close(FH);
-
-# Read lexicon hash from translation file into hash
-my %lexicon = &readlexicon($filename);
-
-
-# Synch lexicon hash and Array of keys to find all doublettes
-# Check for each key in the lexicon hash if this key occures more than one time in the hash file
-# If found, print warning and count
-
+# Print all keys which occur more than once
my $dupl = 0; # total counter to count when a key occurred more than one time
-my %found; # Hash to save keys which have already been found
-
-foreach my $lex_key (keys %lexicon) {
- my $counter = 0;
- foreach my $all_key (@all_keys) {
- if ($all_key eq $lex_key) {
- $counter++;
- if ( ($counter > 1) && (!$found{$all_key}) ) {
- $dupl++ if ($counter == 2);
- $found{$all_key} = 1;
- print 'Found duplicate key: '.$lex_key."\n";
- }
- }
+foreach my $count_key (keys %counter) {
+ my $count_value = $counter{$count_key};
+ if ($count_value > 1) {
+ print 'Found '.$count_value.' times key: '.$count_key."\n";
+ $dupl++;
}
}
+
if ($dupl == 0) {
print "Be happy - No duplicates found.\n";
} else {
@@ -86,38 +70,4 @@
}
# ----------------------------------------------------------------
-# Code taken from sync.pl
-# in : $filename
-# out: %lexicon
-
-sub readlexicon {
- # Read translation file into memory
- my $fn=shift;
- open(IN,$fn) or die;
- my %lexicon=();
- my $contents=join('',<IN>);
- close(IN);
- # Tidy up: remove header data
- $contents=~s/package Apache\:[^\;]+//;
- $contents=~s/use base[^\;]+//;
- # Build hash with hash from file
- my %Lexicon=();
- eval($contents.'; %lexicon=%Lexicon;');
- if ($@ ne "") {
- print "\nAn error occurred during the attempt to retrieve the translation hash for the file '$fn'.\n"
- ."Error: ".$@."\n";
- die;
- }
- # Remove entries which are not needed for synch
- delete $lexicon{'_AUTO'};
- delete $lexicon{'char_encoding'};
- delete $lexicon{'language_code'};
- # Hash is expected not to be empty
- if (!scalar(keys(%lexicon))) {
- print "\nWarning: No translation phrases found in '$fn'.\n";
- }
- return %lexicon;
-}
-
-# ----------------------------------------------------------------
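
For context on the new matching pattern: the ^\s+ anchor and the combined ["'] class assume that lexicon keys sit on their own indented line, while the %Lexicon opener, the "=> 'translation'," lines and comments do not start with whitespace followed by a quote. A made-up snippet (sample lines only mimic the lexicon layout) to check that assumption:

use strict;
use warnings;

# made-up lines in the style of a LON-CAPA lexicon file
my @samples = (
    "%Lexicon = (\n",
    "   'Show All'\n",
    "=> 'Alle anzeigen',\n",
    "   \"Save\"\n",
    "# 'old entry, commented out'\n",
);

foreach my $line (@samples) {
    next if $line =~ /^\s*#/;                          # comments are skipped
    print "matched key: $1\n" if $line =~ /^\s+["'](.*)["']/;
}
# prints:
#   matched key: Show All
#   matched key: Save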