[LON-CAPA-cvs] cvs: loncom /xml lonxml.pm

www lon-capa-cvs@mail.lon-capa.org
Sun, 30 Jan 2005 12:56:45 -0000


www		Sun Jan 30 07:56:45 2005 EDT

  Modified files:              
    /loncom/xml	lonxml.pm 
  Log:
  Bug #1148: HTML::TreeBuilder removes perfectly fine end tags, and Tidy is not
  ready for use. Just manually fix the basics so the button can be used.
  
  
Index: loncom/xml/lonxml.pm
diff -u loncom/xml/lonxml.pm:1.353 loncom/xml/lonxml.pm:1.354
--- loncom/xml/lonxml.pm:1.353	Fri Jan 28 16:08:45 2005
+++ loncom/xml/lonxml.pm	Sun Jan 30 07:56:45 2005
@@ -1,7 +1,7 @@
 # The LearningOnline Network with CAPA
 # XML Parser Module 
 #
-# $Id: lonxml.pm,v 1.353 2005/01/28 21:08:45 albertel Exp $
+# $Id: lonxml.pm,v 1.354 2005/01/30 12:56:45 www Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -368,23 +368,28 @@
 
 sub htmlclean {
     my ($raw,$full)=@_;
+# Take care of CRLF etc
 
-    my $tree = HTML::TreeBuilder->new;
-    $tree->ignore_unknown(0);
-
-    $tree->parse($raw);
-
-    my $output= $tree->as_HTML(undef,' ');
-
-    $output=~s/\<(br|hr|img|meta|allow)(.*?)\>/\<$1$2 \/\>/gis;
-    $output=~s/\<\/(br|hr|img|meta|allow)\>//gis;
+    $raw=~s/\r\f/\n/gs; $raw=~s/\f\r/\n/gs;
+    $raw=~s/\r\n/\n/gs; $raw=~s/\n\r/\n/gs;
+    $raw=~s/\f/\n/gs; $raw=~s/\r/\n/gs;
+    $raw=~s/\&\#10\;/\n/gs; $raw=~s/\&\#13\;/\n/gs;
+
+# Generate empty tags, remove wrong end tags
+    $raw=~s/\<(br|hr|img|meta|allow|basefont)([^\>\/]*?)\>/\<$1$2 \/\>/gis;
+    $raw=~s/\<\/(br|hr|img|meta|allow|basefont)\>//gis;
     unless ($full) {
-       $output=~s/\<[\/]*(body|head|html)\>//gis;
+       $raw=~s/\<[\/]*(body|head|html)\>//gis;
     }
-
-    $tree = $tree->delete;
-
-    return $output;
+# Make standard tags lowercase
+    foreach ('html','body','head','meta','h1','h2','h3','h4','b','i','m',
+             'table','tr','td','th','p','br','hr','img','embed','font',
+             'a','strong','center','title','basefont') {
+	$raw=~s/\<$_\s*\>/\<$_\>/gis;
+        $raw=~s/\<\/$_\s*\>/<\/$_\>/gis;
+        $raw=~s/\<$_\s([^\>]*)\>/<$_ $1\>/gis;
+    }
+    return $raw;
 }
 
 sub latex_special_symbols {