--- loncom/publisher/lonpublisher.pm 2008/08/01 18:11:19 1.244 +++ loncom/publisher/lonpublisher.pm 2008/08/14 13:39:02 1.245 @@ -1,7 +1,7 @@ # The LearningOnline Network with CAPA # Publication Handler # -# $Id: lonpublisher.pm,v 1.244 2008/08/01 18:11:19 bisitz Exp $ +# $Id: lonpublisher.pm,v 1.245 2008/08/14 13:39:02 onken Exp $ # # Copyright Michigan State University Board of Trustees # @@ -118,6 +118,8 @@ use Apache::File; use File::Copy; use Apache::Constants qw(:common :http :methods); use HTML::LCParser; +use HTML::Entities; +use Encode::Encoder; use Apache::lonxml; use Apache::loncacc; use DBI; @@ -1172,17 +1174,22 @@ sub publish { $textonly=~s/\//g; $textonly=~s/\[^\<]+\<\/m\>//g; $textonly=~s/\<[^\>]*\>//g; - $textonly=~tr/A-Z/a-z/; - $textonly=~s/[\$\&][a-z]\w*//g; - $textonly=~s/[^a-z\s]//g; - - foreach ($textonly=~m/(\w+)/g) { - unless ($nokey{$_}) { - $keywords{$_}=1; - } - } - } + #this is a work simplification for german authors for present + $textonly=HTML::Entities::decode($textonly); #decode HTML-character + $textonly=Encode::Encoder::encode('utf8', $textonly); #encode to perl internal unicode + $textonly=~tr/A-ZÜÄÖ/a-züäö/; #add lowercase rule for german "Umlaute" + $textonly=~s/[\$\&][a-z]\w*//g; + $textonly=~s/[^a-z^ü^ä^ö^ß\s]//g; #dont delete german "Umlaute" + + foreach ($textonly=~m/[^\s]+/g) { #match all but whitespaces + unless ($nokey{$_}) { + $keywords{$_}=1; + } + } + + + } foreach my $addkey (split(/[\"\'\,\;]/,$metadatafields{'keywords'})) { $addkey=~s/\s+/ /g;