class HTMLTokenizer

A class to tokenize HTML.

Example:

page = "<HTML>
<HEAD>
<TITLE>This is the title</TITLE>
</HEAD>
 <!-- Here comes the <a href=\"missing.link\">blah</a>
 comment body
  -->
 <BODY>
   <H1>This is the header</H1>
   <P>
     This is the paragraph, it contains
     <a href=\"link.html\">links</a>,
     <img src=\"blah.gif\" optional alt='images
     are
     really cool'>.  Ok, here is some more text and
     <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
   </P>
 </body>
 </HTML>
 "
 toke = HTMLTokenizer.new(page)

 assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
 assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
 assert("links" == toke.getTrimmedText)
 assert(toke.getTag("IMG", "A").attr_hash['optional'])
 assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])

Attributes

page[R]

Public Class Methods

new(content) click to toggle source

Create a new tokenizer, based on the content, used as a string.

# File lib/openid/yadis/htmltokenizer.rb, line 57
# Build a tokenizer over +content+.
#
# content:: any object; it is converted with #to_s and the resulting
#           string is kept as the page to tokenize.
#
# The read position starts at offset 0 of that string.
def initialize(content)
  @page, @cur_pos = content.to_s, 0
end
version() click to toggle source

Get version of HTMLTokenizer lib

# File lib/openid/yadis/htmltokenizer.rb, line 50
# Return the value held in the class variable @@version.
# NOTE(review): @@version is assigned outside this excerpt, so its type and
# format cannot be confirmed from here.
def self.version
  @@version
end

Public Instance Methods

getNextToken() click to toggle source

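Get the next token and advance past it. Returns an instance of HTMLText, HTMLComment or HTMLTag, or nil once the end of the page has been reached.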

# File lib/openid/yadis/htmltokenizer.rb, line 103
# Consume and return the token at the current position.
#
# The token is obtained from peekNextToken; when there is one, the read
# position is moved forward by the length of the token's raw source so the
# following call yields the token after it.
#
# Returns whatever peekNextToken returned (nil when nothing is left).
def getNextToken
  upcoming = peekNextToken
  return upcoming unless upcoming

  @cur_pos += upcoming.raw.length
  upcoming
end
getTag(*sought_tags) click to toggle source

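Get the next tag whose name is in the specified set of desired tags; if no tags are given, the next tag of any kind is returned. For example, foo = toke.getTag("h1", "h2", "h3") will return the next header tag encountered. Returns nil if no matching tag is found before the end of the page.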

# File lib/openid/yadis/htmltokenizer.rb, line 119
# Advance through the page until a wanted tag is found.
#
# sought_tags:: zero or more tag names; each is downcased before comparison.
#               With no names given, the first HTMLTag token matches.
#
# Every token read along the way is consumed via getNextToken.
# Returns the matching HTMLTag, or the falsy value getNextToken produced when
# the tokens ran out.
def getTag(*sought_tags)
  wanted = sought_tags.map { |name| name.downcase }
  accept_any = wanted.empty?

  while (token = getNextToken)
    # Text, comments and anything else that is not a tag are skipped.
    next unless token.kind_of?(HTMLTag)
    break if accept_any || wanted.include?(token.tag_name)
  end
  token
end
getText(until_tag = nil) click to toggle source

Get all the text between the current position and the next tag or, if until_tag is specified, between the current position and the next occurrence of that specific tag.

# File lib/openid/yadis/htmltokenizer.rb, line 133
# Collect text starting at the current position.
#
# until_tag:: optional tag name. When nil (the default) only the text token
#             at the current position is consumed. Otherwise every token up
#             to, but not including, the first tag with this name is
#             consumed, and the name is compared in lower case.
#
# Returns a String:
# * without until_tag: the text of the next token, or "" when the next token
#   is a tag or the end of the page has been reached;
# * with until_tag: the non-empty texts of the consumed tokens, each followed
#   by a single space.
def getText(until_tag = nil)
  if until_tag.nil?
    # Next token is a tag, not text: nothing to return, nothing consumed.
    return "" if ?< == @page[@cur_pos]

    # At the end of the page getNextToken yields nil; answer "" instead of
    # failing with NoMethodError on nil.text.
    token = getNextToken
    return token ? token.text : ""
  end

  # getTag downcases the names it is given before comparing them with
  # tag_name, so do the same here; otherwise getText("P") would never stop.
  # NOTE(review): assumes HTMLTag#tag_name is lower case, as getTag's
  # normalisation implies - confirm against HTMLTag.
  stop_name = until_tag.downcase
  ret_str = ""

  while (tag = peekNextToken)
    # Leave the terminating tag unconsumed so the caller can still read it.
    break if tag.kind_of?(HTMLTag) && tag.tag_name == stop_name

    piece = tag.text
    ret_str << (piece + " ") if "" != piece
    getNextToken
  end

  ret_str
end
getTrimmedText(until_tag = nil) click to toggle source

Like getText, but squeeze all whitespace, getting rid of leading and trailing whitespace, and squeezing multiple spaces into a single space.

# File lib/openid/yadis/htmltokenizer.rb, line 163
# Whitespace-normalised variant of getText.
#
# until_tag:: passed through unchanged to getText.
#
# Returns the text from getText with leading and trailing whitespace removed
# and every remaining run of whitespace replaced by a single space.
def getTrimmedText(until_tag = nil)
  raw_text = getText(until_tag)
  without_edges = raw_text.strip
  without_edges.gsub(/\s+/m, " ")
end
peekNextToken() click to toggle source

Look at the next token, but don't actually grab it

# File lib/openid/yadis/htmltokenizer.rb, line 68
# Build the token that starts at the current position without consuming it
# (the read position is left untouched).
#
# Returns
# * nil when the position equals the page length;
# * an HTMLText covering everything up to, but not including, the next '<'
#   (or the rest of the page if there is none) when the current character
#   is not '<';
# * an HTMLComment running through the first '-->' when the page continues
#   with '<!--';
# * an HTMLTag running through the first '>' otherwise.
#
# Raises HTMLTokenizerError when a comment or tag has no terminator.
def peekNextToken
  return nil if @cur_pos == @page.length

  unless ?< == @page[@cur_pos]
    # Plain text: stop just before the next tag opener, if any.
    next_open = @page.index('<', @cur_pos)
    last_char = next_open.nil? ? -1 : (next_open - 1)
    return HTMLText.new(@page[@cur_pos .. last_char])
  end

  if '!--' == @page[(@cur_pos + 1), 3]
    close_at = @page.index('-->', (@cur_pos + 1))
    raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}" if close_at.nil?

    # close_at points at the first '-' of '-->'; include all three characters.
    HTMLComment.new(@page[@cur_pos .. (close_at + 2)])
  else
    close_at = @page.index('>', (@cur_pos + 1))
    raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}" if close_at.nil?

    HTMLTag.new(@page[@cur_pos .. close_at])
  end
end
reset() click to toggle source

Reset the parser, setting the current position back to the start of the page.

# File lib/openid/yadis/htmltokenizer.rb, line 63
# Rewind the tokenizer: the read position goes back to offset 0 of @page,
# so the next peekNextToken/getNextToken call starts from the first token.
# The page content itself is left unchanged.
def reset
  @cur_pos = 0
end