[LON-CAPA-cvs] cvs: loncom /xml lonxml.pm
www
lon-capa-cvs@mail.lon-capa.org
Sun, 30 Jan 2005 12:56:45 -0000
www Sun Jan 30 07:56:45 2005 EDT
Modified files:
/loncom/xml lonxml.pm
Log:
Bug #1148: HTML::TreeBuilder removes perfectly fine end tags, and Tidy is not
ready for use. Just manually fix the basics so the button can be used.
Index: loncom/xml/lonxml.pm
diff -u loncom/xml/lonxml.pm:1.353 loncom/xml/lonxml.pm:1.354
--- loncom/xml/lonxml.pm:1.353 Fri Jan 28 16:08:45 2005
+++ loncom/xml/lonxml.pm Sun Jan 30 07:56:45 2005
@@ -1,7 +1,7 @@
# The LearningOnline Network with CAPA
# XML Parser Module
#
-# $Id: lonxml.pm,v 1.353 2005/01/28 21:08:45 albertel Exp $
+# $Id: lonxml.pm,v 1.354 2005/01/30 12:56:45 www Exp $
#
# Copyright Michigan State University Board of Trustees
#
@@ -368,23 +368,28 @@
sub htmlclean {
my ($raw,$full)=@_;
+# Take care of CRLF etc
- my $tree = HTML::TreeBuilder->new;
- $tree->ignore_unknown(0);
-
- $tree->parse($raw);
-
- my $output= $tree->as_HTML(undef,' ');
-
- $output=~s/\<(br|hr|img|meta|allow)(.*?)\>/\<$1$2 \/\>/gis;
- $output=~s/\<\/(br|hr|img|meta|allow)\>//gis;
+ $raw=~s/\r\f/\n/gs; $raw=~s/\f\r/\n/gs;
+ $raw=~s/\r\n/\n/gs; $raw=~s/\n\r/\n/gs;
+ $raw=~s/\f/\n/gs; $raw=~s/\r/\n/gs;
+ $raw=~s/\&\#10\;/\n/gs; $raw=~s/\&\#13\;/\n/gs;
+
+# Generate empty tags, remove wrong end tags
+ $raw=~s/\<(br|hr|img|meta|allow|basefont)([^\>\/]*?)\>/\<$1$2 \/\>/gis;
+ $raw=~s/\<\/(br|hr|img|meta|allow|basefont)\>//gis;
unless ($full) {
- $output=~s/\<[\/]*(body|head|html)\>//gis;
+ $raw=~s/\<[\/]*(body|head|html)\>//gis;
}
-
- $tree = $tree->delete;
-
- return $output;
+# Make standard tags lowercase
+ foreach ('html','body','head','meta','h1','h2','h3','h4','b','i','m',
+ 'table','tr','td','th','p','br','hr','img','embed','font',
+ 'a','strong','center','title','basefont') {
+ $raw=~s/\<$_\s*\>/\<$_\>/gis;
+ $raw=~s/\<\/$_\s*\>/<\/$_\>/gis;
+ $raw=~s/\<$_\s([^\>]*)\>/<$_ $1\>/gis;
+ }
+ return $raw;
}
sub latex_special_symbols {