[LON-CAPA-cvs] cvs: loncom /localize/localize checksimilar_1file.pl checksimilar_2files.pl
wenzelju
wenzelju@source.lon-capa.org
Tue, 09 Mar 2010 15:16:26 -0000
wenzelju Tue Mar 9 15:16:26 2010 EDT
Added files:
/loncom/localize/localize checksimilar_1file.pl
checksimilar_2files.pl
Log:
Scripts to check for similar phrases in translationfiles.
Index: loncom/localize/localize/checksimilar_1file.pl
+++ loncom/localize/localize/checksimilar_1file.pl
#!/usr/bin/perl
# The LearningOnline Network with CAPA
# $Id: checksimilar_1file.pl,v 1.1 2010/03/09 15:16:26 wenzelju Exp $
use strict;
use warnings;
####
#### Checks, if there are similar keys in the inputfile (for example de.pm)
####
####--------Subroutines--------####
sub read {
# Read file into memory
my $fn = shift;
open(IN,$fn) or die;
my %filecontent = ();
my $contents = join('',<IN>);
close(IN);
# Build hash with hash from file
my %Lexicon=();
eval($contents.'; %filecontent=%Lexicon;');
return %filecontent;
}
sub similarities{
my $text = shift;
$text =~ s/[.,\_\-?!:]//g;
return $text;
}
####--------Main programm--------####
my $file = $ARGV[0];
my %lang=&read($file);
my $count = 0;
#Copy hash for comparision
my %lang2=%lang;
my %sim;
#For each key in the hash compare it with each other key in the hash except itself
while( my ($kOUT, $vOUT) = each %lang ) {
#Delete the current key, so that it does not find itself
#(revert this action later, see below)
delete $lang2{$kOUT};
my $temp = $kOUT;
$temp = &similarities($temp);
while( my ($kIN, $vIN) = each %lang2 ) {
my $temp2 = $kIN;
$temp2 = &similarities($temp2);
#Print key, if it has similarity to another key and if it has not been checked already
if(lc($temp) eq lc($temp2) && !($sim{$kOUT})){
print ('###'.$kOUT."###".$kIN."###\n");
#Remeber key as already checked
$sim{$kIN} = $kOUT;
$count++;
}
}
$lang2{$kOUT}=$vOUT;
}
print("Finished. ".$count." similar keys found.\n");
Index: loncom/localize/localize/checksimilar_2files.pl
+++ loncom/localize/localize/checksimilar_2files.pl
#!/usr/bin/perl
# The LearningOnline Network with CAPA
# $Id: checksimilar_2files.pl,v 1.1 2010/03/09 15:16:26 wenzelju Exp $
use strict;
use warnings;
use utf8;
use open ':utf8';
####
#### Checks, if there are similar keys in the two inputfiles.
#### For example, check the current lang.pm (first input) and newphrases.
#### So if there are similar keys you don't have to translate
#### them again but use the old value and just modify it.
#### IMPORTANT: Both inputfiles have to contain a hash %Lexicon (like lang.pm) !!!
####--------Subroutines--------####
sub read {
# Read file into memory
my $file = shift;
open(IN,$file) or die;
my %filecontent = ();
my $contents = join('',<IN>);
close(IN);
# Build hash with hash from file
my %Lexicon = ();
eval($contents.'; %filecontent=%Lexicon;');
if ($@ ne "") {
print "\nAn error occurred during the attempt to retrieve the translation hash.\n"
."Error: ".$@."\n";
die;
}
return %filecontent;
}
sub similarities{
my $text = shift;
$text =~ s/[.,\_\-?!:]//g;
return $text;
}
sub CourseCommunity {
my $text1 = shift;
my $text2 = shift;
$text1 =~ s/courses//gi;
$text1 =~ s/communities//gi;
$text1 =~ s/course//gi;
$text1 =~ s/community//gi;
$text2 =~ s/courses//gi;
$text2 =~ s/communities//gi;
$text2 =~ s/course//gi;
$text2 =~ s/community//gi;
if(lc($text1) eq lc($text2)) {
return 1;
}
return 0;
}
####--------Main Program--------####
my $file1 = $ARGV[0]; # Old language.pm
my $file2 = $ARGV[1]; # New Phrases
my %langOLD = &read($file1); #Hash with old phrases
my %langNEW = &read($file2); #Hash with new phrases
my $dlm;
my $count = 1; #Counter
open(OUT,'>similarities.txt') or die;
# For each new phrase, check if there is already a similar one
while( my ($kNEW, $vNEW) = each %langNEW ) {
my $temp1 = $kNEW;
$temp1 = &similarities($temp1);
while( my ($kOLD, $vOLD) = each %langOLD ) {
my $temp2 = $kOLD;
$temp2 = &similarities($temp2);
#Check for similar punctuation (case insensitive) or
#similarity related to Course/Community
if(lc($temp1) eq lc($temp2) || &CourseCommunity($temp1,$temp2)){
#Find delimiter for key and value
if (($kNEW=~/\'/) & ($kNEW=~/\"/)) {
print " (Warning: Both, ' and \", occur!)";
}
if ($kNEW=~/\'/) {
$dlm = '"';
} else {
$dlm = "'";
}
print OUT (<<ENDNEW);
#Old key: $kOLD
$dlm$kNEW$dlm
=> $dlm$vOLD$dlm,
ENDNEW
$count++;
}
}
}
print("Finished. ".$count." similar expressions found!\n");