Annotation of loncom/xml/LCParser.pm, revision 1.1

1.1     ! albertel    1: package HTML::LCParser;
        !             2: 
        !             3: # $Id: TokeParser.pm,v 2.24 2001/03/26 07:32:17 gisle Exp $
        !             4: 
        !             5: require HTML::PullParser;
        !             6: @ISA=qw(HTML::PullParser);
        !             7: $VERSION = sprintf("%d.%02d", q$Revision: 2.24 $ =~ /(\d+)\.(\d+)/);
        !             8: 
        !             9: use strict;
        !            10: use Carp ();
        !            11: use HTML::Entities qw(decode_entities);
        !            12: 
        !            13: my %ARGS =
        !            14: (
        !            15:  start       => "'S',tagname,attr,attrseq,text,line",
        !            16:  end         => "'E',tagname,text,line",
        !            17:  text        => "'T',text,is_cdata,line",
        !            18:  process     => "'PI',token0,text,line",
        !            19:  comment     => "'C',text,line",
        !            20:  declaration => "'D',text,line",
        !            21: );
        !            22: 
        !            23: 
        !            24: sub new
        !            25: {
        !            26:     my $class = shift;
        !            27:     my %cnf;
        !            28:     if (@_ == 1) {
        !            29: 	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
        !            30: 	%cnf = ($type => $_[0]);
        !            31:     }
        !            32:     else {
        !            33: 	%cnf = @_;
        !            34:     }
        !            35: 
        !            36:     my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
        !            37: 
        !            38:     my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
        !            39: 
        !            40:     $self->{textify} = $textify;
        !            41:     $self->encoded_entities(1);
        !            42:     $self;
        !            43: }
        !            44: 
        !            45: 
        !            46: sub get_tag
        !            47: {
        !            48:     my $self = shift;
        !            49:     my $token;
        !            50:     while (1) {
        !            51: 	$token = $self->get_token || return undef;
        !            52: 	my $type = shift @$token;
        !            53: 	next unless $type eq "S" || $type eq "E";
        !            54: 	substr($token->[0], 0, 0) = "/" if $type eq "E";
        !            55: 	return $token unless @_;
        !            56: 	for (@_) {
        !            57: 	    return $token if $token->[0] eq $_;
        !            58: 	}
        !            59:     }
        !            60: }
        !            61: 
        !            62: 
        !            63: sub get_text
        !            64: {
        !            65:     my $self = shift;
        !            66:     my $endat = shift;
        !            67:     my @text;
        !            68:     while (my $token = $self->get_token) {
        !            69: 	my $type = $token->[0];
        !            70: 	if ($type eq "T") {
        !            71: 	    my $text = $token->[1];
        !            72: 	    push(@text, $text);
        !            73: 	} elsif ($type =~ /^[SE]$/) {
        !            74: 	    my $tag = $token->[1];
        !            75: 	    if ($type eq "S") {
        !            76: 		if (exists $self->{textify}{$tag}) {
        !            77: 		    my $alt = $self->{textify}{$tag};
        !            78: 		    my $text;
        !            79: 		    if (ref($alt)) {
        !            80: 			$text = &$alt(@$token);
        !            81: 		    } else {
        !            82: 			$text = $token->[2]{$alt || "alt"};
        !            83: 			$text = "[\U$tag]" unless defined $text;
        !            84: 		    }
        !            85: 		    push(@text, $text);
        !            86: 		    next;
        !            87: 		}
        !            88: 	    } else {
        !            89: 		$tag = "/$tag";
        !            90: 	    }
        !            91: 	    if (!defined($endat) || $endat eq $tag) {
        !            92: 		 $self->unget_token($token);
        !            93: 		 last;
        !            94: 	    }
        !            95: 	}
        !            96:     }
        !            97:     join("", @text);
        !            98: }
        !            99: 
        !           100: 
        !           101: sub get_trimmed_text
        !           102: {
        !           103:     my $self = shift;
        !           104:     my $text = $self->get_text(@_);
        !           105:     $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
        !           106:     $text;
        !           107: }
        !           108: 
        !           109: 1;
        !           110: 
        !           111: 
        !           112: __END__
        !           113: 
        !           114: =head1 NAME
        !           115: 
        !           116: HTML::LCParser - Alternative HTML::Parser interface
        !           117: 
        !           118: =head1 SYNOPSIS
        !           119: 
        !           120:  require HTML::LCParser;
        !           121:  $p = HTML::LCParser->new("index.html") || die "Can't open: $!";
        !           122:  while (my $token = $p->get_token) {
        !           123:      #...
        !           124:  }
        !           125: 
        !           126: =head1 DESCRIPTION
        !           127: 
        !           128: The C<HTML::LCParser> is an alternative interface to the
        !           129: C<HTML::Parser> class.  It is an C<HTML::PullParser> subclass.
        !           130: 
        !           131: The following methods are available:
        !           132: 
        !           133: =over 4
        !           134: 
        !           135: =item $p = HTML::LCParser->new( $file_or_doc );
        !           136: 
        !           137: The object constructor argument is either a file name, a file handle
        !           138: object, or the complete document to be parsed.
        !           139: 
        !           140: If the argument is a plain scalar, then it is taken as the name of a
        !           141: file to be opened and parsed.  If the file can't be opened for
        !           142: reading, then the constructor will return an undefined value and $!
        !           143: will tell you why it failed.
        !           144: 
        !           145: If the argument is a reference to a plain scalar, then this scalar is
        !           146: taken to be the literal document to parse.  The value of this
        !           147: scalar should not be changed before all tokens have been extracted.
        !           148: 
        !           149: Otherwise the argument is taken to be some object that the
        !           150: C<HTML::LCParser> can read() from when it needs more data.  Typically
        !           151: it will be a filehandle of some kind.  The stream will be read() until
        !           152: EOF, but not closed.
        !           153: 
        !           154: It also will turn encoded_entities on by default.
        !           155: 
        !           156: =item $p->get_token
        !           157: 
        !           158: This method will return the next I<token> found in the HTML document,
        !           159: or C<undef> at the end of the document.  The token is returned as an
        !           160: array reference.  The first element of the array will be a (mostly)
        !           161: single character string denoting the type of this token: "S" for start
        !           162: tag, "E" for end tag, "T" for text, "C" for comment, "D" for
        !           163: declaration, and "PI" for process instructions.  The rest of the array
        !           164: is the same as the arguments passed to the corresponding HTML::Parser
        !           165: v2 compatible callbacks (see L<HTML::Parser>).  In summary, returned
        !           166: tokens look like this:
        !           167: 
        !           168:   ["S",  $tag, $attr, $attrseq, $text, $line]
        !           169:   ["E",  $tag, $text, $line]
        !           170:   ["T",  $text, $is_data, $line]
        !           171:   ["C",  $text, $line]
        !           172:   ["D",  $text, $line]
        !           173:   ["PI", $token0, $text, $line]
        !           174: 
        !           175: where $attr is a hash reference, $attrseq is an array reference and
        !           176: the rest are plain scalars.
        !           177: 
        !           178: =item $p->unget_token($token,...)
        !           179: 
        !           180: If you find out you have read too many tokens you can push them back,
        !           181: so that they are returned the next time $p->get_token is called.
        !           182: 
        !           183: =item $p->get_tag( [$tag, ...] )
        !           184: 
        !           185: This method returns the next start or end tag (skipping any other
        !           186: tokens), or C<undef> if there are no more tags in the document.  If
        !           187: one or more arguments are given, then we skip tokens until one of the
        !           188: specified tag types is found.  For example:
        !           189: 
        !           190:    $p->get_tag("font", "/font");
        !           191: 
        !           192: will find the next start or end tag for a font-element.
        !           193: 
        !           194: The tag information is returned as an array reference in the same form
        !           195: as for $p->get_token above, but the type code (first element) is
        !           196: missing. A start tag will be returned like this:
        !           197: 
        !           198:   [$tag, $attr, $attrseq, $text]
        !           199: 
        !           200: The tagname of end tags are prefixed with "/", i.e. end tag is
        !           201: returned like this:
        !           202: 
        !           203:   ["/$tag", $text]
        !           204: 
        !           205: =item $p->get_text( [$endtag] )
        !           206: 
        !           207: This method returns all text found at the current position. It will
        !           208: return a zero length string if the next token is not text.  The
        !           209: optional $endtag argument specifies that any text occurring before the
        !           210: given tag is to be returned. All entities are unmodified.
        !           211: 
        !           212: The $p->{textify} attribute is a hash that defines how certain tags can
        !           213: be treated as text.  If the name of a start tag matches a key in this
        !           214: hash then this tag is converted to text.  The hash value is used to
        !           215: specify which tag attribute to obtain the text from.  If this tag
        !           216: attribute is missing, then the upper case name of the tag enclosed in
        !           217: brackets is returned, e.g. "[IMG]".  The hash value can also be a
        !           218: subroutine reference.  In this case the routine is called with the
        !           219: start tag token content as its argument and the return value is treated
        !           220: as the text.
        !           221: 
        !           222: The default $p->{textify} value is:
        !           223: 
        !           224:   {img => "alt", applet => "alt"}
        !           225: 
        !           226: This means that <IMG> and <APPLET> tags are treated as text, and that
        !           227: the text to substitute can be found in the ALT attribute.
        !           228: 
        !           229: =item $p->get_trimmed_text( [$endtag] )
        !           230: 
        !           231: Same as $p->get_text above, but will collapse any sequences of white
        !           232: space to a single space character.  Leading and trailing white space is
        !           233: removed.
        !           234: 
        !           235: =back
        !           236: 
        !           237: =head1 EXAMPLES
        !           238: 
        !           239: This example extracts all links from a document.  It will print one
        !           240: line for each link, containing the URL and the textual description
        !           241: between the <A>...</A> tags:
        !           242: 
        !           243:   use HTML::LCParser;
        !           244:   $p = HTML::LCParser->new(shift||"index.html");
        !           245: 
        !           246:   while (my $token = $p->get_tag("a")) {
        !           247:       my $url = $token->[1]{href} || "-";
        !           248:       my $text = $p->get_trimmed_text("/a");
        !           249:       print "$url\t$text\n";
        !           250:   }
        !           251: 
        !           252: This example extract the <TITLE> from the document:
        !           253: 
        !           254:   use HTML::LCParser;
        !           255:   $p = HTML::LCParser->new(shift||"index.html");
        !           256:   if ($p->get_tag("title")) {
        !           257:       my $title = $p->get_trimmed_text;
        !           258:       print "Title: $title\n";
        !           259:   }
        !           260: 
        !           261: =head1 SEE ALSO
        !           262: 
        !           263: L<HTML::PullParser>, L<HTML::Parser>
        !           264: 
        !           265: =head1 COPYRIGHT
        !           266: 
        !           267: Copyright 1998-2001 Gisle Aas.
        !           268: 
        !           269: This library is free software; you can redistribute it and/or
        !           270: modify it under the same terms as Perl itself.
        !           271: 
        !           272: =cut

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>