[LON-CAPA-cvs] cvs: loncom /localize/localize checksimilar_2files.pl
bisitz
bisitz at source.lon-capa.org
Thu Jan 10 13:07:52 EST 2013
bisitz Thu Jan 10 18:07:52 2013 EDT
Modified files:
/loncom/localize/localize checksimilar_2files.pl
Log:
- Detect even more similarities by extending the list of similar characters and phrases
- Script code:
- Better-fitting subroutine names
- Whitespace follows the LON-CAPA standard
Index: loncom/localize/localize/checksimilar_2files.pl
diff -u loncom/localize/localize/checksimilar_2files.pl:1.3 loncom/localize/localize/checksimilar_2files.pl:1.4
--- loncom/localize/localize/checksimilar_2files.pl:1.3 Mon Jan 7 15:13:26 2013
+++ loncom/localize/localize/checksimilar_2files.pl Thu Jan 10 18:07:52 2013
@@ -1,6 +1,6 @@
#!/usr/bin/perl
# The LearningOnline Network with CAPA
-# $Id: checksimilar_2files.pl,v 1.3 2013/01/07 15:13:26 bisitz Exp $
+# $Id: checksimilar_2files.pl,v 1.4 2013/01/10 18:07:52 bisitz Exp $
use strict;
use warnings;
@@ -35,15 +35,16 @@
return %filecontent;
}
-sub similarities{
+sub similar_chars {
my $text = shift;
- $text =~ s/[.,\_\-?!:]//g;
+ $text =~ s/\[_\d\]//g; # translation parameters
+ $text =~ s/[.,\_\-?!: \/]//g; # punctuation
return $text;
}
-sub CourseCommunity {
+sub similar_phrases {
my $text1 = shift;
my $text2 = shift;
@@ -52,12 +53,19 @@
$text1 =~ s/communities/X001X/gi;
$text1 =~ s/course/X002X/gi;
$text1 =~ s/community/X002X/gi;
+ $text1 =~ s/member/X003X/gi;
+ $text1 =~ s/student/X003X/gi;
+ $text1 =~ s/students/X003X/gi;
+
$text2 =~ s/courses/X001X/gi;
$text2 =~ s/communities/X001X/gi;
$text2 =~ s/course/X002X/gi;
$text2 =~ s/community/X002X/gi;
+ $text2 =~ s/member/X003X/gi;
+ $text2 =~ s/student/X003X/gi;
+ $text2 =~ s/students/X003X/gi;
- if(lc($text1) eq lc($text2)) {
+ if (lc($text1) eq lc($text2)) {
return 1;
}
@@ -81,15 +89,15 @@
# For each new phrase, check if there is already a similar one
while( my ($kNEW, $vNEW) = each %langNEW ) {
my $temp1 = $kNEW;
- $temp1 = &similarities($temp1);
+ $temp1 = &similar_chars($temp1);
while( my ($kOLD, $vOLD) = each %langOLD ) {
my $temp2 = $kOLD;
- $temp2 = &similarities($temp2);
+ $temp2 = &similar_chars($temp2);
#Check for similar punctuation (case insensitive) or
- #similarity related to Course/Community
- if(lc($temp1) eq lc($temp2) || &CourseCommunity($temp1,$temp2)){
+ #similarity related to similar phrases
+ if (lc($temp1) eq lc($temp2) || &similar_phrases($temp1,$temp2)) {
#Find delimiter for key and value
if (($kNEW=~/\'/) & ($kNEW=~/\"/)) {
print " (Warning: Both, ' and \", occur!)";
More information about the LON-CAPA-cvs mailing list