File:  [LON-CAPA] / loncom / xml / LCParser.pm
Revision 1.2: download - view: text, annotated - select for diffs
Mon Apr 1 21:47:13 2002 UTC (22 years, 1 month ago) by albertel
Branches: MAIN
CVS tags: version_2_9_X, version_2_9_99_0, version_2_9_1, version_2_9_0, version_2_8_X, version_2_8_99_1, version_2_8_99_0, version_2_8_2, version_2_8_1, version_2_8_0, version_2_7_X, version_2_7_99_1, version_2_7_99_0, version_2_7_1, version_2_7_0, version_2_6_X, version_2_6_99_1, version_2_6_99_0, version_2_6_3, version_2_6_2, version_2_6_1, version_2_6_0, version_2_5_X, version_2_5_99_1, version_2_5_99_0, version_2_5_2, version_2_5_1, version_2_5_0, version_2_4_X, version_2_4_99_0, version_2_4_2, version_2_4_1, version_2_4_0, version_2_3_X, version_2_3_99_0, version_2_3_2, version_2_3_1, version_2_3_0, version_2_2_X, version_2_2_99_1, version_2_2_99_0, version_2_2_2, version_2_2_1, version_2_2_0, version_2_1_X, version_2_1_99_3, version_2_1_99_2, version_2_1_99_1, version_2_1_99_0, version_2_1_3, version_2_1_2, version_2_1_1, version_2_1_0, version_2_12_X, version_2_11_X, version_2_11_4_uiuc, version_2_11_4_msu, version_2_11_4, version_2_11_3_uiuc, version_2_11_3_msu, version_2_11_3, version_2_11_2_uiuc, version_2_11_2_msu, version_2_11_2_educog, version_2_11_2, version_2_11_1, version_2_11_0_RC3, version_2_11_0_RC2, version_2_11_0_RC1, version_2_11_0, version_2_10_X, version_2_10_1, version_2_10_0_RC2, version_2_10_0_RC1, version_2_10_0, version_2_0_X, version_2_0_99_1, version_2_0_2, version_2_0_1, version_2_0_0, version_1_99_3, version_1_99_2, version_1_99_1_tmcc, version_1_99_1, version_1_99_0_tmcc, version_1_99_0, version_1_3_X, version_1_3_3, version_1_3_2, version_1_3_1, version_1_3_0, version_1_2_X, version_1_2_99_1, version_1_2_99_0, version_1_2_1, version_1_2_0, version_1_1_X, version_1_1_99_5, version_1_1_99_4, version_1_1_99_3, version_1_1_99_2, version_1_1_99_1, version_1_1_99_0, version_1_1_3, version_1_1_2, version_1_1_1, version_1_1_0, version_1_0_99_3, version_1_0_99_2, version_1_0_99_1, version_1_0_99, version_1_0_3, version_1_0_2, version_1_0_1, version_1_0_0, version_0_99_5, version_0_99_4, version_0_99_3, version_0_99_2, version_0_99_1, version_0_99_0, version_0_6_2, version_0_6, version_0_5_1, version_0_5, version_0_4, stable_2002_july, stable_2002_april, loncapaMITrelate_1, language_hyphenation_merge, language_hyphenation, conference_2003, bz6209-base, bz6209, bz5969, bz2851, STABLE, PRINT_INCOMPLETE_base, PRINT_INCOMPLETE, HEAD, GCI_3, GCI_2, GCI_1, BZ5971-printing-apage, BZ5434-fox, BZ4492-merge, BZ4492-feature_horizontal_radioresponse
- conforms to HTML-Parser 3.26

    1: package HTML::LCParser;
    2: 
    3: # $Id: LCParser.pm,v 1.2 2002/04/01 21:47:13 albertel Exp $
    4: 
    5: require HTML::PullParser;
    6: @ISA=qw(HTML::PullParser);
    7: $VERSION = sprintf("%d.%02d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/);
    8: 
    9: use strict;
   10: use Carp ();
   11: use HTML::Entities qw(decode_entities);
   12: 
   13: my %ARGS =
   14: (
   15:  start       => "'S',tagname,attr,attrseq,text,line",
   16:  end         => "'E',tagname,text,line",
   17:  text        => "'T',text,is_cdata,line",
   18:  process     => "'PI',token0,text,line",
   19:  comment     => "'C',text,line",
   20:  declaration => "'D',text,line",
   21: );
   22: 
   23: 
   24: sub new
   25: {
   26:     my $class = shift;
   27:     my %cnf;
   28:     if (@_ == 1) {
   29: 	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
   30: 	%cnf = ($type => $_[0]);
   31:     }
   32:     else {
   33: 	%cnf = @_;
   34:     }
   35: 
   36:     my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
   37: 
   38:     my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
   39: 
   40:     $self->{textify} = $textify;
   41:     $self->attr_encoded(1);
   42:     $self;
   43: }
   44: 
   45: 
   46: sub get_tag
   47: {
   48:     my $self = shift;
   49:     my $token;
   50:     while (1) {
   51: 	$token = $self->get_token || return undef;
   52: 	my $type = shift @$token;
   53: 	next unless $type eq "S" || $type eq "E";
   54: 	substr($token->[0], 0, 0) = "/" if $type eq "E";
   55: 	return $token unless @_;
   56: 	for (@_) {
   57: 	    return $token if $token->[0] eq $_;
   58: 	}
   59:     }
   60: }
   61: 
   62: 
   63: sub get_text
   64: {
   65:     my $self = shift;
   66:     my $endat = shift;
   67:     my @text;
   68:     while (my $token = $self->get_token) {
   69: 	my $type = $token->[0];
   70: 	if ($type eq "T") {
   71: 	    my $text = $token->[1];
   72: 	    push(@text, $text);
   73: 	} elsif ($type =~ /^[SE]$/) {
   74: 	    my $tag = $token->[1];
   75: 	    if ($type eq "S") {
   76: 		if (exists $self->{textify}{$tag}) {
   77: 		    my $alt = $self->{textify}{$tag};
   78: 		    my $text;
   79: 		    if (ref($alt)) {
   80: 			$text = &$alt(@$token);
   81: 		    } else {
   82: 			$text = $token->[2]{$alt || "alt"};
   83: 			$text = "[\U$tag]" unless defined $text;
   84: 		    }
   85: 		    push(@text, $text);
   86: 		    next;
   87: 		}
   88: 	    } else {
   89: 		$tag = "/$tag";
   90: 	    }
   91: 	    if (!defined($endat) || $endat eq $tag) {
   92: 		 $self->unget_token($token);
   93: 		 last;
   94: 	    }
   95: 	}
   96:     }
   97:     join("", @text);
   98: }
   99: 
  100: 
  101: sub get_trimmed_text
  102: {
  103:     my $self = shift;
  104:     my $text = $self->get_text(@_);
  105:     $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
  106:     $text;
  107: }
  108: 
  109: 1;
  110: 
  111: 
  112: __END__
  113: 
  114: =head1 NAME
  115: 
  116: HTML::LCParser - Alternative HTML::Parser interface
  117: 
  118: =head1 SYNOPSIS
  119: 
  120:  require HTML::LCParser;
  121:  $p = HTML::LCParser->new("index.html") || die "Can't open: $!";
  122:  while (my $token = $p->get_token) {
  123:      #...
  124:  }
  125: 
  126: =head1 DESCRIPTION
  127: 
  128: The C<HTML::LCParser> is an alternative interface to the
  129: C<HTML::Parser> class.  It is an C<HTML::PullParser> subclass.
  130: 
  131: The following methods are available:
  132: 
  133: =over 4
  134: 
  135: =item $p = HTML::LCParser->new( $file_or_doc );
  136: 
  137: The object constructor argument is either a file name, a file handle
  138: object, or the complete document to be parsed.
  139: 
  140: If the argument is a plain scalar, then it is taken as the name of a
  141: file to be opened and parsed.  If the file can't be opened for
  142: reading, then the constructor will return an undefined value and $!
  143: will tell you why it failed.
  144: 
  145: If the argument is a reference to a plain scalar, then this scalar is
  146: taken to be the literal document to parse.  The value of this
  147: scalar should not be changed before all tokens have been extracted.
  148: 
  149: Otherwise the argument is taken to be some object that the
  150: C<HTML::LCParser> can read() from when it needs more data.  Typically
  151: it will be a filehandle of some kind.  The stream will be read() until
  152: EOF, but not closed.
  153: 
  154: It also will turn attr_encoded on by default.
  155: 
  156: =item $p->get_token
  157: 
  158: This method will return the next I<token> found in the HTML document,
  159: or C<undef> at the end of the document.  The token is returned as an
  160: array reference.  The first element of the array will be a (mostly)
  161: single character string denoting the type of this token: "S" for start
  162: tag, "E" for end tag, "T" for text, "C" for comment, "D" for
  163: declaration, and "PI" for process instructions.  The rest of the array
  164: is the same as the arguments passed to the corresponding HTML::Parser
  165: v2 compatible callbacks (see L<HTML::Parser>).  In summary, returned
  166: tokens look like this:
  167: 
  168:   ["S",  $tag, $attr, $attrseq, $text, $line]
  169:   ["E",  $tag, $text, $line]
  170:   ["T",  $text, $is_data, $line]
  171:   ["C",  $text, $line]
  172:   ["D",  $text, $line]
  173:   ["PI", $token0, $text, $line]
  174: 
  175: where $attr is a hash reference, $attrseq is an array reference and
  176: the rest are plain scalars.
  177: 
  178: =item $p->unget_token($token,...)
  179: 
  180: If you find out you have read too many tokens you can push them back,
  181: so that they are returned the next time $p->get_token is called.
  182: 
  183: =item $p->get_tag( [$tag, ...] )
  184: 
  185: This method returns the next start or end tag (skipping any other
  186: tokens), or C<undef> if there are no more tags in the document.  If
  187: one or more arguments are given, then we skip tokens until one of the
  188: specified tag types is found.  For example:
  189: 
  190:    $p->get_tag("font", "/font");
  191: 
  192: will find the next start or end tag for a font-element.
  193: 
  194: The tag information is returned as an array reference in the same form
  195: as for $p->get_token above, but the type code (first element) is
  196: missing. A start tag will be returned like this:
  197: 
  198:   [$tag, $attr, $attrseq, $text]
  199: 
  200: The tagname of end tags are prefixed with "/", i.e. end tag is
  201: returned like this:
  202: 
  203:   ["/$tag", $text]
  204: 
  205: =item $p->get_text( [$endtag] )
  206: 
  207: This method returns all text found at the current position. It will
  208: return a zero length string if the next token is not text.  The
  209: optional $endtag argument specifies that any text occurring before the
  210: given tag is to be returned. All entities are unmodified.
  211: 
  212: The $p->{textify} attribute is a hash that defines how certain tags can
  213: be treated as text.  If the name of a start tag matches a key in this
  214: hash then this tag is converted to text.  The hash value is used to
  215: specify which tag attribute to obtain the text from.  If this tag
  216: attribute is missing, then the upper case name of the tag enclosed in
  217: brackets is returned, e.g. "[IMG]".  The hash value can also be a
  218: subroutine reference.  In this case the routine is called with the
  219: start tag token content as its argument and the return value is treated
  220: as the text.
  221: 
  222: The default $p->{textify} value is:
  223: 
  224:   {img => "alt", applet => "alt"}
  225: 
  226: This means that <IMG> and <APPLET> tags are treated as text, and that
  227: the text to substitute can be found in the ALT attribute.
  228: 
  229: =item $p->get_trimmed_text( [$endtag] )
  230: 
  231: Same as $p->get_text above, but will collapse any sequences of white
  232: space to a single space character.  Leading and trailing white space is
  233: removed.
  234: 
  235: =back
  236: 
  237: =head1 EXAMPLES
  238: 
  239: This example extracts all links from a document.  It will print one
  240: line for each link, containing the URL and the textual description
  241: between the <A>...</A> tags:
  242: 
  243:   use HTML::LCParser;
  244:   $p = HTML::LCParser->new(shift||"index.html");
  245: 
  246:   while (my $token = $p->get_tag("a")) {
  247:       my $url = $token->[1]{href} || "-";
  248:       my $text = $p->get_trimmed_text("/a");
  249:       print "$url\t$text\n";
  250:   }
  251: 
  252: This example extract the <TITLE> from the document:
  253: 
  254:   use HTML::LCParser;
  255:   $p = HTML::LCParser->new(shift||"index.html");
  256:   if ($p->get_tag("title")) {
  257:       my $title = $p->get_trimmed_text;
  258:       print "Title: $title\n";
  259:   }
  260: 
  261: =head1 SEE ALSO
  262: 
  263: L<HTML::PullParser>, L<HTML::Parser>
  264: 
  265: =head1 COPYRIGHT
  266: 
  267: Copyright 1998-2001 Gisle Aas.
  268: 
  269: This library is free software; you can redistribute it and/or
  270: modify it under the same terms as Perl itself.
  271: 
  272: =cut

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>