[LON-CAPA-cvs] cvs: modules /gerd/Wiki convert.pl
www
www at source.lon-capa.org
Tue Dec 13 13:09:52 EST 2011
www Tue Dec 13 18:09:52 2011 EDT
Modified files:
/modules/gerd/Wiki convert.pl
Log:
Taking care of redirects
Index: modules/gerd/Wiki/convert.pl
diff -u modules/gerd/Wiki/convert.pl:1.1 modules/gerd/Wiki/convert.pl:1.2
--- modules/gerd/Wiki/convert.pl:1.1 Tue Dec 13 03:05:59 2011
+++ modules/gerd/Wiki/convert.pl Tue Dec 13 18:09:52 2011
@@ -1,17 +1,23 @@
use strict;
# Read the article index file
my %residx=();
+my %idxres=();
my %resfile=();
+my %fileres=();
+my %titleres=();
my %duplicatecnt=();
open(IN,'dump/articles.dat');
while (my $line=<IN>) {
chomp($line);
my ($idx,$res)=split(/\t/,$line);
- $residx{$line}=$idx;
+ $residx{$res}=$idx;
+ $idxres{$idx}=$res;
my $file;
+ my $title;
if ($res=~/\s[\-\–]+\s/) {
my ($dir,$remainder)=split(/\s+[\-\–]+\s+/,$res);
$dir=~s/\W//g;
+ $title=$remainder;
$remainder=~s/\,/\_/g;
$remainder=~s/\s/\_/g;
$remainder=~s/\W//g;
@@ -19,6 +25,7 @@
$file=$dir.'/'.$remainder;
} else {
$file=$res;
+ $title=$res;
$file=~s/\,/\_/g;
$file=~s/\s/\_/g;
$file=~s/\W//g;
@@ -30,6 +37,37 @@
$file.='_'.$duplicatecnt{$file};
}
$resfile{$file}=$idx;
- print $file."\n";
+ $fileres{$idx}=$file;
+ $title=~s/[^\w \,\-\']//g;
+ $titleres{$idx}=$title;
}
close(IN);
+# So far:
+# residx: Wiki-reference -> index
+# idxres: index -> Wiki-reference
+# resfile: filename -> index
+# fileres: index -> filename
+# titleres: index -> clean title
+#
+# Now deal with redirects
+#
+foreach my $idx (keys(%idxres)) {
+ open(IN,'dump/'.$idx.'.wikitext');
+ my $line=<IN>;
+ close(IN);
+ chomp ($line);
+ if ($line=~/\#REDIRECT\s*\[\[([^\]]+)\]\]$/) {
+ my $redir=$1;
+ unless ($residx{$redir}) {
+# How did that happen? The redirect points nowhere
+ print "*** WARNING: $redir not defined!\n";
+ } else {
+ my $oldref=$idxres{$idx};
+# Make $oldref resolve to the same index as its redirect target $redir
+ $residx{$oldref}=$residx{$redir};
+ }
+# No need to visit this file again
+ delete $idxres{$idx};
+ }
+}
+# We are now only left with real files that need translation
More information about the LON-CAPA-cvs
mailing list