/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package org.util.xml.parse.policy;

import org.util.xml.parse.policy.ParserPolicy;
import javax.swing.JOptionPane;
import org.util.xml.element.Element;
import org.util.xml.element.TagElement;

/**
 * Parser policy tuned for HTML input.
 *
 * <p>It reports that end tags need not match, names a fixed set of tags that
 * are to be forced empty, and tries to discover the document encoding from a
 * {@code <meta http-equiv="content-type" content="...; charset=...">} element.
 * If no encoding is known by the time the {@code body} tag is reported, the
 * user is asked for one through a dialog.
 *
 * <p>NOTE(review): how the parser consumes these answers is defined by
 * {@code DefaultParserPolicy} and the parser itself, which are not visible
 * here - confirm there.
 *
 * @author masaru
 */
public class HTMLParserPolicy extends DefaultParserPolicy {

    /** Initial value offered in the encoding dialog of {@link #selectEncoding(String)}. */
    private static final String DEFAULT_ENCODING_SUGGESTION = "JISAutoDetect";

    /** Marker that precedes the encoding name inside a content-type value. */
    private static final String CHARSET_MARKER = "charset";

    /** Tag names (lower case) for which {@link #forceEmptyTag(String)} answers {@code true}. */
    protected String[] forse_empty_tag_list_ = {"br","hr","meta","link","img","input","base","dd","dt","frame","p","pre","li","space"};

    /** Encoding found in a meta tag or chosen by the user; {@code null} while unknown. */
    protected String encoding_ = null;

    /**
     * Always answers {@code false}.
     *
     * @return {@code false}
     */
    public boolean checkEndTagMatch() {
        return false;
    }

    /**
     * Tells whether the given tag name is in the forced-empty tag list.
     * The comparison ignores case and does not depend on the default locale.
     *
     * @param key tag name; {@code null} is treated as "not in the list"
     * @return {@code true} if {@code key} names one of the listed tags
     */
    public boolean forceEmptyTag(String key) {
        if (key == null) {
            return false;
        }
        for (String tag : forse_empty_tag_list_) {
            if (tag.equalsIgnoreCase(key)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Inspects an element for an encoding declaration and passes it through.
     *
     * <p>While no encoding is known, a {@code meta} tag whose
     * {@code http-equiv} attribute is {@code content-type} is searched for a
     * {@code charset} value in its {@code content} attribute. A value found
     * this way is remembered and later returned by
     * {@link #selectEncoding(String)}.
     *
     * @param element element reported by the parser
     * @return the same {@code element}, never replaced or filtered
     */
    public Element allowElement(Element element) {
        if (encoding_ != null || !element.isTagElement()) {
            return element;
        }
        TagElement tag = (TagElement) element;
        if (!"meta".equalsIgnoreCase(tag.getKey())) {
            return element;
        }
        if (!"content-type".equalsIgnoreCase(tag.getAttributeValue("http-equiv", ""))) {
            return element;
        }
        // The content attribute may be missing; extractCharset copes with null.
        String encoding = extractCharset(tag.getAttributeValue("content"));
        if (encoding != null) {
            System.out.println("found encoding: " + encoding);
            encoding_ = encoding;
        }
        return element;
    }

    /**
     * Returns the encoding to use, asking the user if none is known when the
     * {@code body} tag is reached.
     *
     * <p>The dialog is shown only when no encoding has been determined yet, so
     * an encoding found in a meta tag is not overwritten. If the user cancels
     * the dialog the encoding stays {@code null}.
     *
     * @param last_tag_key name of the most recently read tag, may be {@code null}
     * @return the known encoding, or {@code null} if it is still undetermined
     */
    public String selectEncoding(String last_tag_key) {
        if (encoding_ == null && "body".equalsIgnoreCase(last_tag_key)) {
            encoding_ = JOptionPane.showInputDialog(
                    "<html>encoding does not defained before reading body tag<br/>select encoding</html>",
                    DEFAULT_ENCODING_SUGGESTION);
        }
        return encoding_;
    }

    /**
     * Pulls the charset name out of a content-type value such as
     * {@code text/html; charset=UTF-8}.
     *
     * <p>The marker is matched without regard to case. Separators
     * (whitespace, {@code =}, quotes) after the marker are skipped and the
     * name ends at the next whitespace, quote, {@code ;} or {@code ,}.
     *
     * @param content value of the content attribute, may be {@code null}
     * @return the charset name, or {@code null} if none is present
     */
    private static String extractCharset(String content) {
        if (content == null) {
            return null;
        }
        final int length = content.length();
        final int markerLength = CHARSET_MARKER.length();

        // Case-insensitive search done in place so indices stay valid for content.
        int cursor = -1;
        for (int i = 0; i + markerLength <= length; i++) {
            if (content.regionMatches(true, i, CHARSET_MARKER, 0, markerLength)) {
                cursor = i + markerLength;
                break;
            }
        }
        if (cursor < 0) {
            return null;
        }

        while (cursor < length && isCharsetSeparator(content.charAt(cursor))) {
            cursor++;
        }
        int end = cursor;
        while (end < length && !isCharsetTerminator(content.charAt(end))) {
            end++;
        }
        return end > cursor ? content.substring(cursor, end) : null;
    }

    /** True for characters that may sit between the charset marker and its value. */
    private static boolean isCharsetSeparator(char c) {
        return c == '=' || c == '"' || c == '\'' || Character.isWhitespace(c);
    }

    /** True for characters that end a charset value. */
    private static boolean isCharsetTerminator(char c) {
        return c == ';' || c == ',' || c == '"' || c == '\'' || Character.isWhitespace(c);
    }
}