Class REXML::Parsers::BaseParser
In: lib/rexml/parsers/baseparser.rb
Parent: Object

Using the Pull Parser

This API is experimental, and subject to change.

 parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
 while parser.has_next?
   res = parser.next
   puts res[1]['att'] if res.start_tag? and res[0] == 'b'
 end

See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.

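Notice that, as the following example shows, a malformed document is reported through an error event that the caller has to check for: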

 parser = PullParser.new( "<a>BAD DOCUMENT" )
 while parser.has_next?
   res = parser.next
   raise res[1] if res.error?
 end

Nat Price gave me some good ideas for the API.

Methods

add_listener   empty?   entity   has_next?   new   normalize   peek   position   pull   stream=   unnormalize   unshift  

Constants

# --- Name and reference building blocks (strings interpolated into the
# --- regular expressions below) ---------------------------------------
NCNAME_STR = '[\w:][\-\w\d.]*'
# Captures the optional prefix and the local part separately.
NAME_STR = "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
# Same shape as NAME_STR but without capture groups.
UNAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
NAMECHAR = '[\-\w\d\.:]'
NAME = "([\\w:]#{NAMECHAR}*)"
NMTOKEN = "(?:#{NAMECHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

# --- Prolog, comment, CDATA and processing-instruction patterns --------
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
CDATA_END = /^\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um

# --- Element tags -------------------------------------------------------
# Groups: 1 = qualified name, 2 = prefix, 3 = local part,
#         4 = raw attribute text, 5 = quote character, 6 = "/" if self-closing.
TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um

# --- Pseudo-attributes of the XML declaration --------------------------
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
# Whitespace after "=" is optional, matching VERSION and ENCODING; the
# previous pattern demanded exactly one whitespace character there and so
# never matched the usual standalone="yes" form.
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

# --- DOCTYPE internal subset -------------------------------------------
ENTITY_START = /^\s*<!ENTITY/
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
ELEMENTDECL_START = /^\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /^\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NOTATIONDECL_START = /^\s*<!NOTATION/um
PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
TEXT_PATTERN = /\A([^<]*)/um

# --- Entity constants ---------------------------------------------------
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
NDATADECL = "\\s+NDATA\\s+#{NAME}"
PEREFERENCE = "%#{NAME};"
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
# An ampersand that does not start a well-formed named reference.
EREFERENCE = /&(?!#{NAME};)/

# Predefined entities. Each value is
# [ pattern matching the reference, reference text, literal character,
#   pattern matching the literal character ].
DEFAULT_ENTITIES = {
  'gt' => [/&gt;/, '&gt;', '>', />/],
  'lt' => [/&lt;/, '&lt;', '<', /</],
  'quot' => [/&quot;/, '&quot;', '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/]
}

# These are patterns to identify common markup errors, to make the
# error messages more informative.
MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um

Attributes

source  [R] 

Public Class methods

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 109
# Creates a parser reading from +source+.
# All setup is delegated to #stream=, which also resets the parser state.
def initialize(source)
  self.stream = source
end

Public Instance methods

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 113
# Registers +listener+ to be notified of every event returned by #pull.
#
# On the first registration the receiver's #pull is wrapped (the original
# stays reachable as _old_pull) so that each pulled event is passed to
# every listener's +receive+ before being returned to the caller.
# Returns the listener list.
def add_listener( listener )
  unless defined?(@listeners) && @listeners
    @listeners = []
    # Evaluated as source text so that `alias` and `def` take effect on
    # this one parser instance only.
    instance_eval "alias :_old_pull :pull\ndef pull\nevent = _old_pull\n@listeners.each do |listener|\nlistener.receive event\nend\nevent\nend\n"
  end
  @listeners << listener
end

Returns true if there are no more events

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 153
# True when the source has nothing left to read and no events are waiting
# on the push-back stack.
def empty?
  @source.empty? && @stack.empty?
end

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 424
# Resolves the entity named +reference+ to its replacement text.
#
# +entities+ (may be nil) is consulted first; if it has no value for the
# name, the predefined DEFAULT_ENTITIES table is used. The value found is
# itself passed through #unnormalize. Returns nil for an unknown name.
def entity( reference, entities )
  value = entities && entities[ reference ]
  unless value
    predefined = DEFAULT_ENTITIES[ reference ]
    value = predefined[2] if predefined
  end
  return nil unless value
  unnormalize( value, entities )
end

Returns true if there are more events. Synonymous with !empty?

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 158
# True while the source still has input or events are waiting on the
# push-back stack.
def has_next?
  !@source.empty? || !@stack.empty?
end

Escapes all possible entities

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 435
# Escapes all possible entities in +input+ and returns the escaped copy;
# +input+ itself is never modified.
#
# input::         the text to escape
# entities::      optional hash of entity name => replacement pattern;
#                 every occurrence of the pattern becomes "&name;"
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted from +entities+
def normalize( input, entities=nil, entity_filter=nil )
  # dup rather than clone: clone would carry over a frozen state and make
  # the in-place substitutions below fail on frozen input.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  entities.each do |key, value|
    # The filter is keyed by entity name.
    next if entity_filter and entity_filter.include?(key)
    copy.gsub!( value, "&#{key};" )
  end if entities
  # Escape any bare ampersands again after the entity substitutions.
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each_value do |value|
    # value[3] matches the literal character, value[1] is its reference.
    copy.gsub!( value[3], value[1] )
  end
  copy
end

Peek at the event at position depth in the stack. The first element on the stack is at depth 0. If depth is -1, will parse to the end of the input stream and return the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the requested depth, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 174
# Returns the event at position +depth+ of the look-ahead stack without
# consuming it (0 is the next event).
#
# Events are pulled and buffered on the stack until that position exists.
# A depth of -1 pulls until the parser reports #empty? and returns the last
# buffered event. Raises a RuntimeError for any depth below -1.
def peek(depth=0)
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  pulled = []
  if depth == -1
    pulled.push(pull()) until empty?
  else
    wanted = depth + 1
    pulled.push(pull()) while @stack.size + pulled.size < wanted
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 143
# Returns the source's own +position+ if it provides one, otherwise 0.
def position
  # FIXME: 0 is only a placeholder for sources that cannot report a position.
  @source.respond_to?(:position) ? @source.position : 0
end

Returns the next event. This is a PullEvent object.

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 189
# Returns the next event as an array whose first element is the event type
# symbol (for example [:start_element, name, attributes] or [:text, text]).
#
# Order of work:
# 1. a pending :end_element for a self-closing tag,
# 2. [:end_document] when nothing is left,
# 3. events waiting on the push-back stack,
# 4. the prolog (XML declaration, comments, PIs, DOCTYPE),
# 5. declarations inside a DOCTYPE internal subset,
# 6. element content (tags, comments, CDATA, PIs, text).
#
# Raises REXML::ParseException for malformed input and
# REXML::UndefinedNamespaceException for an undeclared namespace prefix.
# Any other StandardError raised while parsing content is wrapped in a
# REXML::ParseException.
def pull
  # A self-closing tag was reported as :start_element on the previous call;
  # emit its matching :end_element now.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  # Events pushed back with #unshift or buffered by #peek come first.
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2
  if @document_status == nil
    # Prolog: look at the leading whitespace run or the first <...> chunk
    # (without consuming it) to decide what comes next.
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      # Hand the declared encoding (possibly nil) to the source.
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      @nsstack.unshift(Set.new)
      identity = md[1]
      close = md[2]
      identity =~ IDENTITY
      name = $1
      raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $4.nil? ? nil : $4.strip
      uri = $6.nil? ? nil : $6.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # No internal subset: queue the matching :end_doctype right away.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        # "[" opened an internal subset; its declarations are parsed below.
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace only: stay in the prolog and let the content
      # section below report it.
    else
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      @source.match(/\s*/um, true)
    end
  end
  if @document_status == :in_doctype
    # Look ahead (without consuming) at the next declaration.
    md = @source.match(/\s*(.*?>)/um)
    # Without this guard an unterminated subset failed with NoMethodError
    # on nil instead of a parse error.
    raise REXML::ParseException.new( "Malformed DOCTYPE: unterminated declaration", @source ) if md.nil?
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      # Drop the unmatched (nil) groups so the positions below line up.
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        # Parameter entity: remember the marker and re-append it at the end.
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # strip the surrounding quotes
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        # match is [ :entity, name, PUBLIC, pubid, href ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
          # An xmlns:* attribute default declares that namespace prefix.
          if attdef[0] =~ /^xmlns:(.*)/
            @nsstack[0] << $1
          end
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      # "]>" closes the internal subset.
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
  end
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        # End tag: it must match the most recently opened element.
        @nsstack.shift
        last_tag = @tags.pop
        md = @source.match( CLOSE_MATCH, true )
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        # "<!-" starts a comment; anything else is tried as CDATA.
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        attributes = {}
        prefixes = Set.new
        prefixes << md[2] if md[2]
        @nsstack.unshift(curr_ns=Set.new)
        if md[4].size > 0
          attrs = md[4].scan( ATTRIBUTE_PATTERN )
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
          # ATTRIBUTE_PATTERN groups: qualified name, prefix, local part,
          # quote character, value.
          attrs.each { |attr_name, attr_prefix, local_part, _quote, value|
            if attr_prefix == "xmlns"
              if local_part == "xml"
                # Compare the attribute VALUE; the quote character was
                # compared here before, which rejected every xmlns:xml
                # declaration including the correct one.
                if value != "http://www.w3.org/XML/1998/namespace"
                  msg = "The 'xml' prefix must not be bound to any other namespace "+
                  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                  raise REXML::ParseException.new( msg, @source, self )
                end
              elsif local_part == "xmlns"
                msg = "The 'xmlns' prefix must not be declared "+
                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                raise REXML::ParseException.new( msg, @source, self)
              end
              curr_ns << local_part
            elsif attr_prefix
              prefixes << attr_prefix unless attr_prefix == "xml"
            end
            attributes[attr_name] = value
          }
        end

        # Verify that all of the prefixes have been defined
        prefixes.each do |prefix|
          unless @nsstack.find{|k| k.member?(prefix)}
            raise REXML::UndefinedNamespaceException.new(prefix,@source,self)
          end
        end

        if md[6]
          # Self-closing: the next call emits the :end_element.
          @closed = md[1]
          @nsstack.shift
        else
          @tags.push( md[1] )
        end
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      return [ :text, md[1] ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue StandardError => error
    # StandardError only: interrupts, exits and out-of-memory conditions
    # must propagate unchanged instead of being reported as parse errors.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, (error ? error : $!) )
  end
  # NOTE(review): appears unreachable, since every branch above returns or raises.
  return [ :dummy ]
end

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 133
# Points the parser at a new +source+ (wrapped through
# SourceFactory.create_from) and resets all parsing state: the pending
# self-closing tag, the document status, the open-tag list, the push-back
# stack, the entity list and the namespace stack.
def stream=( source )
  @source = SourceFactory.create_from( source )
  @closed = @document_status = nil
  @tags, @stack, @entities = [], [], []
  @nsstack = []
end

Unescapes all possible entities

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 451
# Unescapes all possible entities in +string+ and returns the result as a
# new string; +string+ itself is never modified.
#
# string::   the text to unescape; "\r\n" and "\r" become "\n"
# entities:: optional hash of entity name => value, consulted (through
#            #entity) before the predefined DEFAULT_ENTITIES
# filter::   optional collection of entity names to leave untouched
#
# Numeric character references (decimal and hexadecimal) are replaced by
# the corresponding character, and "&amp;" is unescaped last.
def unnormalize( string, entities=nil, filter=nil )
  # dup rather than clone: clone would carry over a frozen state and make
  # the in-place substitutions below fail on frozen input.
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    code = $1
    # "x41" -> "0x41" so that Integer() parses it as hexadecimal.
    code = "0#{code}" if code[0, 1] == 'x'
    [Integer(code)].pack('U*')
  }
  # REFERENCE_RE captures a name only for named references; numeric
  # references yield nil and are dropped here.
  names = matches.collect { |m| m[0] }.compact
  if names.size > 0
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      entity_value = entity( entity_reference, entities )
      if entity_value
        # Escape the name ('.' is legal in names but special in a regexp)
        # and use the block form so backslashes in the value are inserted
        # literally rather than read as back-references.
        rv.gsub!( /&#{Regexp.escape(entity_reference)};/ ) { entity_value }
      end
    end
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      er = DEFAULT_ENTITIES[entity_reference]
      rv.gsub!( er[0], er[2] ) if er
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end

Push an event back on the head of the stream. This method has (theoretically) infinite depth.

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 164
# Pushes +token+ back onto the head of the event stream, so it is the next
# event handed out. Returns the internal stack.
def unshift(token)
  @stack.insert(0, token)
end

[Validate]