loncom/xml/LCParser.pm - view

File: [LON-CAPA] / loncom / xml / LCParser.pm
Revision 1.2: download - view: text, annotated - select for diffs
Mon Apr 1 21:47:13 2002 UTC (22 years, 1 month ago) by albertel
Branches: MAIN
CVS tags: version_2_9_X, version_2_9_99_0, version_2_9_1, version_2_9_0, version_2_8_X, version_2_8_99_1, version_2_8_99_0, version_2_8_2, version_2_8_1, version_2_8_0, version_2_7_X, version_2_7_99_1, version_2_7_99_0, version_2_7_1, version_2_7_0, version_2_6_X, version_2_6_99_1, version_2_6_99_0, version_2_6_3, version_2_6_2, version_2_6_1, version_2_6_0, version_2_5_X, version_2_5_99_1, version_2_5_99_0, version_2_5_2, version_2_5_1, version_2_5_0, version_2_4_X, version_2_4_99_0, version_2_4_2, version_2_4_1, version_2_4_0, version_2_3_X, version_2_3_99_0, version_2_3_2, version_2_3_1, version_2_3_0, version_2_2_X, version_2_2_99_1, version_2_2_99_0, version_2_2_2, version_2_2_1, version_2_2_0, version_2_1_X, version_2_1_99_3, version_2_1_99_2, version_2_1_99_1, version_2_1_99_0, version_2_1_3, version_2_1_2, version_2_1_1, version_2_1_0, version_2_12_X, version_2_11_X, version_2_11_4_uiuc, version_2_11_4_msu, version_2_11_4, version_2_11_3_uiuc, version_2_11_3_msu, version_2_11_3, version_2_11_2_uiuc, version_2_11_2_msu, version_2_11_2_educog, version_2_11_2, version_2_11_1, version_2_11_0_RC3, version_2_11_0_RC2, version_2_11_0_RC1, version_2_11_0, version_2_10_X, version_2_10_1, version_2_10_0_RC2, version_2_10_0_RC1, version_2_10_0, version_2_0_X, version_2_0_99_1, version_2_0_2, version_2_0_1, version_2_0_0, version_1_99_3, version_1_99_2, version_1_99_1_tmcc, version_1_99_1, version_1_99_0_tmcc, version_1_99_0, version_1_3_X, version_1_3_3, version_1_3_2, version_1_3_1, version_1_3_0, version_1_2_X, version_1_2_99_1, version_1_2_99_0, version_1_2_1, version_1_2_0, version_1_1_X, version_1_1_99_5, version_1_1_99_4, version_1_1_99_3, version_1_1_99_2, version_1_1_99_1, version_1_1_99_0, version_1_1_3, version_1_1_2, version_1_1_1, version_1_1_0, version_1_0_99_3, version_1_0_99_2, version_1_0_99_1, version_1_0_99, version_1_0_3, version_1_0_2, version_1_0_1, version_1_0_0, version_0_99_5, version_0_99_4, version_0_99_3, version_0_99_2, version_0_99_1, version_0_99_0, version_0_6_2, version_0_6, version_0_5_1, version_0_5, version_0_4, stable_2002_july, stable_2002_april, loncapaMITrelate_1, language_hyphenation_merge, language_hyphenation, conference_2003, bz6209-base, bz6209, bz5969, bz2851, STABLE, PRINT_INCOMPLETE_base, PRINT_INCOMPLETE, HEAD, GCI_3, GCI_2, GCI_1, BZ5971-printing-apage, BZ5434-fox, BZ4492-merge, BZ4492-feature_horizontal_radioresponse

- conforms to HTML-Parser 3.26

1: package HTML::LCParser; 2: 3: # $Id: LCParser.pm,v 1.2 2002/04/01 21:47:13 albertel Exp $ 4: 5: require HTML::PullParser; 6: @ISA=qw(HTML::PullParser); 7: $VERSION = sprintf("%d.%02d", q$Revision: 1.2 $ =~ /(\d+)\.(\d+)/); 8: 9: use strict; 10: use Carp (); 11: use HTML::Entities qw(decode_entities); 12: 13: my %ARGS = 14: ( 15: start => "'S',tagname,attr,attrseq,text,line", 16: end => "'E',tagname,text,line", 17: text => "'T',text,is_cdata,line", 18: process => "'PI',token0,text,line", 19: comment => "'C',text,line", 20: declaration => "'D',text,line", 21: ); 22: 23: 24: sub new 25: { 26: my $class = shift; 27: my %cnf; 28: if (@_ == 1) { 29: my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; 30: %cnf = ($type => $_[0]); 31: } 32: else { 33: %cnf = @_; 34: } 35: 36: my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; 37: 38: my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; 39: 40: $self->{textify} = $textify; 41: $self->attr_encoded(1); 42: $self; 43: } 44: 45: 46: sub get_tag 47: { 48: my $self = shift; 49: my $token; 50: while (1) { 51: $token = $self->get_token || return undef; 52: my $type = shift @$token; 53: next unless $type eq "S" || $type eq "E"; 54: substr($token->[0], 0, 0) = "/" if $type eq "E"; 55: return $token unless @_; 56: for (@_) { 57: return $token if $token->[0] eq $_; 58: } 59: } 60: } 61: 62: 63: sub get_text 64: { 65: my $self = shift; 66: my $endat = shift; 67: my @text; 68: while (my $token = $self->get_token) { 69: my $type = $token->[0]; 70: if ($type eq "T") { 71: my $text = $token->[1]; 72: push(@text, $text); 73: } elsif ($type =~ /^[SE]$/) { 74: my $tag = $token->[1]; 75: if ($type eq "S") { 76: if (exists $self->{textify}{$tag}) { 77: my $alt = $self->{textify}{$tag}; 78: my $text; 79: if (ref($alt)) { 80: $text = &$alt(@$token); 81: } else { 82: $text = $token->[2]{$alt || "alt"}; 83: $text = "[\U$tag]" unless defined $text; 84: } 85: push(@text, $text); 86: next; 87: } 88: } else { 89: $tag = "/$tag"; 90: } 91: if (!defined($endat) || $endat eq $tag) { 92: $self->unget_token($token); 93: last; 94: } 95: } 96: } 97: join("", @text); 98: } 99: 100: 101: sub get_trimmed_text 102: { 103: my $self = shift; 104: my $text = $self->get_text(@_); 105: $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; 106: $text; 107: } 108: 109: 1; 110: 111: 112: __END__ 113: 114: =head1 NAME 115: 116: HTML::LCParser - Alternative HTML::Parser interface 117: 118: =head1 SYNOPSIS 119: 120: require HTML::LCParser; 121: $p = HTML::LCParser->new("index.html") || die "Can't open: $!"; 122: while (my $token = $p->get_token) { 123: #... 124: } 125: 126: =head1 DESCRIPTION 127: 128: The C<HTML::LCParser> is an alternative interface to the 129: C<HTML::Parser> class. It is an C<HTML::PullParser> subclass. 130: 131: The following methods are available: 132: 133: =over 4 134: 135: =item $p = HTML::LCParser->new( $file_or_doc ); 136: 137: The object constructor argument is either a file name, a file handle 138: object, or the complete document to be parsed. 139: 140: If the argument is a plain scalar, then it is taken as the name of a 141: file to be opened and parsed. If the file can't be opened for 142: reading, then the constructor will return an undefined value and $! 143: will tell you why it failed. 144: 145: If the argument is a reference to a plain scalar, then this scalar is 146: taken to be the literal document to parse. The value of this 147: scalar should not be changed before all tokens have been extracted. 148: 149: Otherwise the argument is taken to be some object that the 150: C<HTML::LCParser> can read() from when it needs more data. Typically 151: it will be a filehandle of some kind. The stream will be read() until 152: EOF, but not closed. 153: 154: It also will turn attr_encoded on by default. 155: 156: =item $p->get_token 157: 158: This method will return the next I<token> found in the HTML document, 159: or C<undef> at the end of the document. The token is returned as an 160: array reference. The first element of the array will be a (mostly) 161: single character string denoting the type of this token: "S" for start 162: tag, "E" for end tag, "T" for text, "C" for comment, "D" for 163: declaration, and "PI" for process instructions. The rest of the array 164: is the same as the arguments passed to the corresponding HTML::Parser 165: v2 compatible callbacks (see L<HTML::Parser>). In summary, returned 166: tokens look like this: 167: 168: ["S", $tag, $attr, $attrseq, $text, $line] 169: ["E", $tag, $text, $line] 170: ["T", $text, $is_data, $line] 171: ["C", $text, $line] 172: ["D", $text, $line] 173: ["PI", $token0, $text, $line] 174: 175: where $attr is a hash reference, $attrseq is an array reference and 176: the rest are plain scalars. 177: 178: =item $p->unget_token($token,...) 179: 180: If you find out you have read too many tokens you can push them back, 181: so that they are returned the next time $p->get_token is called. 182: 183: =item $p->get_tag( [$tag, ...] ) 184: 185: This method returns the next start or end tag (skipping any other 186: tokens), or C<undef> if there are no more tags in the document. If 187: one or more arguments are given, then we skip tokens until one of the 188: specified tag types is found. For example: 189: 190: $p->get_tag("font", "/font"); 191: 192: will find the next start or end tag for a font-element. 193: 194: The tag information is returned as an array reference in the same form 195: as for $p->get_token above, but the type code (first element) is 196: missing. A start tag will be returned like this: 197: 198: [$tag, $attr, $attrseq, $text] 199: 200: The tagname of end tags are prefixed with "/", i.e. end tag is 201: returned like this: 202: 203: ["/$tag", $text] 204: 205: =item $p->get_text( [$endtag] ) 206: 207: This method returns all text found at the current position. It will 208: return a zero length string if the next token is not text. The 209: optional $endtag argument specifies that any text occurring before the 210: given tag is to be returned. All entities are unmodified. 211: 212: The $p->{textify} attribute is a hash that defines how certain tags can 213: be treated as text. If the name of a start tag matches a key in this 214: hash then this tag is converted to text. The hash value is used to 215: specify which tag attribute to obtain the text from. If this tag 216: attribute is missing, then the upper case name of the tag enclosed in 217: brackets is returned, e.g. "[IMG]". The hash value can also be a 218: subroutine reference. In this case the routine is called with the 219: start tag token content as its argument and the return value is treated 220: as the text. 221: 222: The default $p->{textify} value is: 223: 224: {img => "alt", applet => "alt"} 225: 226: This means that <IMG> and <APPLET> tags are treated as text, and that 227: the text to substitute can be found in the ALT attribute. 228: 229: =item $p->get_trimmed_text( [$endtag] ) 230: 231: Same as $p->get_text above, but will collapse any sequences of white 232: space to a single space character. Leading and trailing white space is 233: removed. 234: 235: =back 236: 237: =head1 EXAMPLES 238: 239: This example extracts all links from a document. It will print one 240: line for each link, containing the URL and the textual description 241: between the <A>...</A> tags: 242: 243: use HTML::LCParser; 244: $p = HTML::LCParser->new(shift||"index.html"); 245: 246: while (my $token = $p->get_tag("a")) { 247: my $url = $token->[1]{href} || "-"; 248: my $text = $p->get_trimmed_text("/a"); 249: print "$url\t$text\n"; 250: } 251: 252: This example extract the <TITLE> from the document: 253: 254: use HTML::LCParser; 255: $p = HTML::LCParser->new(shift||"index.html"); 256: if ($p->get_tag("title")) { 257: my $title = $p->get_trimmed_text; 258: print "Title: $title\n"; 259: } 260: 261: =head1 SEE ALSO 262: 263: L<HTML::PullParser>, L<HTML::Parser> 264: 265: =head1 COPYRIGHT 266: 267: Copyright 1998-2001 Gisle Aas. 268: 269: This library is free software; you can redistribute it and/or 270: modify it under the same terms as Perl itself. 271: 272: =cut