001 package org.util.html.factory; 002 003 004 import java.util.*; 005 import java.io.*; 006 import java.net.*; 007 import java.awt.*; 008 import java.awt.event.*; 009 import javax.swing.*; 010 import javax.swing.event.*; 011 012 import org.util.html.objects.*; 013 import org.util.log.*; 014 015 import org.util.xml.parse.*; 016 import org.util.xml.parse.policy.*; 017 import org.util.xml.element.*; 018 019 public class HTMLDocumentFactory { 020 021 private LogListener log_listener_; 022 private URLConnection connection_; 023 private ParserPolicy html_document_parser_policy_; 024 private ParserPolicy head_tag_parser_policy_; 025 private ParserPolicy body_tag_parser_policy_; 026 private HTMLDocument current_document_; 027 028 public HTMLDocumentFactory() { 029 030 URLConnection.setDefaultAllowUserInteraction(true); 031 // User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.13) Gecko/2009080317 Fedora/3.0.13-1.fc10 Firefox/3.0.13 GTB5 032 033 html_document_parser_policy_ = new HTMLParserPolicy(){ 034 @Override public boolean throwExceptionIfDocumentHasError() { 035 return false; 036 } 037 @Override public Element allowElement(Element element) { 038 return element; 039 } 040 @Override public ParserPolicy getInnerPolicy(Element element) { 041 if(!element.isTagElement()) return null; 042 TagElement tag = (TagElement)element; 043 if(tag.getKey().toLowerCase().equals("body")) 044 return body_tag_parser_policy_; 045 else if(tag.getKey().toLowerCase().equals("head")) 046 return head_tag_parser_policy_; 047 return this; 048 } 049 }; 050 051 head_tag_parser_policy_ = new HTMLParserPolicy(){ 052 @Override public boolean throwExceptionIfDocumentHasError() { 053 return false; 054 } 055 @Override public Element allowElement(Element element) { 056 super.allowElement(element); 057 return element; 058 } 059 }; 060 061 body_tag_parser_policy_ = new HTMLParserPolicy(){ 062 @Override public boolean throwExceptionIfDocumentHasError() { 063 return false; 064 } 065 @Override public Element allowElement(Element element) { 066 if(element.isTextElement()) { 067 TextElement text = (TextElement)element; 068 HTMLText tobj = new HTMLText(current_document_); 069 tobj.setText(text.getValue()); 070 current_document_.add(tobj); 071 return element; 072 } else { 073 TagElement tag = (TagElement)element; 074 String key = tag.getKey().toLowerCase(); 075 if(key.equals("img")) { 076 HTMLImg timg = new HTMLImg(current_document_); 077 try{ 078 timg.setURL(new URL(tag.getAttributeValue("src"))); 079 }catch(Exception e){} 080 current_document_.add(timg); 081 return element; 082 } 083 } 084 return element; 085 } 086 }; 087 } 088 089 public void setLogListener(LogListener log_listener) { 090 log_listener_ = log_listener; 091 } 092 093 public HTMLDocument createDocument(URL url, HTMLDocument doc) throws Exception { 094 connection_ = url.openConnection(); 095 return createDocument(url, connection_, connection_.getInputStream(), doc); 096 } 097 098 public HTMLDocument createDocument(URL url, URLConnection connection, InputStream is, HTMLDocument document) throws Exception { 099 assert is != null; 100 101 if(document==null) 102 document = new HTMLDocument(); 103 104 current_document_ = document; 105 current_document_.clear(); 106 current_document_.setDocumentBase(url); 107 connection_ = connection; 108 ElementParser parser = null; 109 String encoding = null; 110 if(connection_!=null) 111 encoding = connection_.getContentEncoding(); 112 if(encoding != null) 113 parser = new ElementParser(is, encoding); 114 else 115 parser = new ElementParser(is); 116 117 parser.setPolicy(html_document_parser_policy_); 118 119 Element[] element_list = parser.parse(); 120 121 System.out.println("skipped:"); 122 System.out.println("---------------------"); 123 //for(Element element : element_list) 124 // System.out.println(element); 125 126 return current_document_; 127 } 128 129 }