Save This Page
Home » openjdk-7 » javax » swing » text » html » parser » [javadoc | source]
    1   /*
    2    * Copyright 1998-2003 Sun Microsystems, Inc.  All Rights Reserved.
    3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4    *
    5    * This code is free software; you can redistribute it and/or modify it
    6    * under the terms of the GNU General Public License version 2 only, as
    7    * published by the Free Software Foundation.  Sun designates this
    8    * particular file as subject to the "Classpath" exception as provided
    9    * by Sun in the LICENSE file that accompanied this code.
   10    *
   11    * This code is distributed in the hope that it will be useful, but WITHOUT
   12    * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   13    * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   14    * version 2 for more details (a copy is included in the LICENSE file that
   15    * accompanied this code).
   16    *
   17    * You should have received a copy of the GNU General Public License version
   18    * 2 along with this work; if not, write to the Free Software Foundation,
   19    * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   20    *
   21    * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
   22    * CA 95054 USA or visit www.sun.com if you need additional information or
   23    * have any questions.
   24    */
   25   
   26   package javax.swing.text.html.parser;
   27   
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML;
import javax.swing.text.ChangedCharSetException;

import java.util.*;
import java.io.*;
import java.net.*;
   35   import java.net;
   36   
   37   /**
   38    * A Parser for HTML Documents (actually, you can specify a DTD, but
   39    * you should really only use this class with the html dtd in swing).
   40    * Reads an InputStream of HTML and
   41    * invokes the appropriate methods in the ParserCallback class. This
   42    * is the default parser used by HTMLEditorKit to parse HTML url's.
   43    * <p>This will message the callback for all valid tags, as well as
   44    * tags that are implied but not explicitly specified. For example, the
   45    * html string (&lt;p&gt;blah) only has a p tag defined. The callback
   46    * will see the following methods:
   47    * <ol><li><i>handleStartTag(html, ...)</i></li>
   48    *     <li><i>handleStartTag(head, ...)</i></li>
   49    *     <li><i>handleEndTag(head)</i></li>
   50    *     <li><i>handleStartTag(body, ...)</i></li>
   51    *     <li>handleStartTag(p, ...)</li>
   52    *     <li>handleText(...)</li>
   53    *     <li><i>handleEndTag(p)</i></li>
   54    *     <li><i>handleEndTag(body)</i></li>
   55    *     <li><i>handleEndTag(html)</i></li>
   56    * </ol>
   57    * The items in <i>italic</i> are implied, that is, although they were not
   58    * explicitly specified, to be correct html they should have been present
   59    * (head isn't necessary, but it is still generated). For tags that
   60    * are implied, the AttributeSet argument will have a value of
   61    * <code>Boolean.TRUE</code> for the key
   62    * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
   63    * <p>HTML.Attributes defines a type safe enumeration of html attributes.
   64    * If an attribute key of a tag is defined in HTML.Attribute, the
   65    * HTML.Attribute will be used as the key, otherwise a String will be used.
   66    * For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
   67    * not defined in HTML.Attribute, whereas class is, therefore the
   68    * AttributeSet will have two values in it, HTML.Attribute.CLASS with
   69    * a String value of 'neat' and the String key 'foo' with a String value of
   70    * 'bar'.
   71    * <p>The position argument will indicate the start of the tag, comment
   72    * or text. Similar to arrays, the first character in the stream has a
   73    * position of 0. For tags that are
   74    * implied the position will indicate
   75    * the location of the next encountered tag. In the first example,
   76    * the implied start body and html tags will have the same position as the
   77    * p tag, and the implied end p, html and body tags will all have the same
   78    * position.
   79    * <p>As html skips whitespace the position for text will be the position
   80    * of the first valid character, eg in the string '\n\n\nblah'
   81    * the text 'blah' will have a position of 3, the newlines are skipped.
   82    * <p>
   83    * For attributes that do not have a value, eg in the html
   84    * string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
   85    * does not have a value, there are two possible values that will be
   86    * placed in the AttributeSet's value:
   87    * <ul>
   88    * <li>If the DTD does not contain a definition for the element, or the
   89    *     definition does not have an explicit value then the value in the
   90    *     AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
   91    * <li>If the DTD contains an explicit value, as in:
   92    *     <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
   93    *     this value from the dtd (in this case selected) will be used.
   94    * </ul>
   95    * <p>
   96    * Once the stream has been parsed, the callback is notified of the most
   97    * likely end of line string. The end of line string will be one of
   98    * \n, \r or \r\n, which ever is encountered the most in parsing the
   99    * stream.
  100    *
  101    * @author      Sunita Mani
  102    */
  103   public class DocumentParser extends javax.swing.text.html.parser.Parser {
  104   
  105       private int inbody;
  106       private int intitle;
  107       private int inhead;
  108       private int instyle;
  109       private int inscript;
  110       private boolean seentitle;
  111       private HTMLEditorKit.ParserCallback callback = null;
  112       private boolean ignoreCharSet = false;
  113       private static final boolean debugFlag = false;
  114   
  115       public DocumentParser(DTD dtd) {
  116           super(dtd);
  117       }
  118   
  119       public void parse(Reader in,  HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
  120           this.ignoreCharSet = ignoreCharSet;
  121           this.callback = callback;
  122           parse(in);
  123           // end of line
  124           callback.handleEndOfLineString(getEndOfLineString());
  125       }
  126   
  127       /**
  128        * Handle Start Tag.
  129        */
  130       protected void handleStartTag(TagElement tag) {
  131   
  132           Element elem = tag.getElement();
  133           if (elem == dtd.body) {
  134               inbody++;
  135           } else if (elem == dtd.html) {
  136           } else if (elem == dtd.head) {
  137               inhead++;
  138           } else if (elem == dtd.title) {
  139               intitle++;
  140           } else if (elem == dtd.style) {
  141               instyle++;
  142           } else if (elem == dtd.script) {
  143               inscript++;
  144           }
  145           if (debugFlag) {
  146               if (tag.fictional()) {
  147                   debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  148               } else {
  149                   debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
  150                         getAttributes() + " pos: " + getCurrentPos());
  151               }
  152           }
  153           if (tag.fictional()) {
  154               SimpleAttributeSet attrs = new SimpleAttributeSet();
  155               attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
  156                                  Boolean.TRUE);
  157               callback.handleStartTag(tag.getHTMLTag(), attrs,
  158                                       getBlockStartPosition());
  159           } else {
  160               callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
  161                                       getBlockStartPosition());
  162               flushAttributes();
  163           }
  164       }
  165   
  166   
  167       protected void handleComment(char text[]) {
  168           if (debugFlag) {
  169               debug("comment: ->" + new String(text) + "<-"
  170                     + " pos: " + getCurrentPos());
  171           }
  172           callback.handleComment(text, getBlockStartPosition());
  173       }
  174   
  175       /**
  176        * Handle Empty Tag.
  177        */
  178       protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
  179   
  180           Element elem = tag.getElement();
  181           if (elem == dtd.meta && !ignoreCharSet) {
  182               SimpleAttributeSet atts = getAttributes();
  183               if (atts != null) {
  184                   String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
  185                   if (content != null) {
  186                       if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
  187                           if (!content.equalsIgnoreCase("text/html") &&
  188                                   !content.equalsIgnoreCase("text/plain")) {
  189                               throw new ChangedCharSetException(content, false);
  190                           }
  191                       } else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
  192                           throw new ChangedCharSetException(content, true);
  193                       }
  194                   }
  195               }
  196           }
  197           if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) {
  198               if (debugFlag) {
  199                   if (tag.fictional()) {
  200                       debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  201                   } else {
  202                       debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
  203                             + getAttributes() + " pos: " + getCurrentPos());
  204                   }
  205               }
  206               if (tag.fictional()) {
  207                   SimpleAttributeSet attrs = new SimpleAttributeSet();
  208                   attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
  209                                      Boolean.TRUE);
  210                   callback.handleSimpleTag(tag.getHTMLTag(), attrs,
  211                                            getBlockStartPosition());
  212               } else {
  213                   callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
  214                                            getBlockStartPosition());
  215                   flushAttributes();
  216               }
  217           }
  218       }
  219   
  220       /**
  221        * Handle End Tag.
  222        */
  223       protected void handleEndTag(TagElement tag) {
  224           Element elem = tag.getElement();
  225           if (elem == dtd.body) {
  226               inbody--;
  227           } else if (elem == dtd.title) {
  228               intitle--;
  229               seentitle = true;
  230           } else if (elem == dtd.head) {
  231               inhead--;
  232           } else if (elem == dtd.style) {
  233               instyle--;
  234           } else if (elem == dtd.script) {
  235               inscript--;
  236           }
  237           if (debugFlag) {
  238               debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
  239           }
  240           callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());
  241   
  242       }
  243   
  244       /**
  245        * Handle Text.
  246        */
  247       protected void handleText(char data[]) {
  248           if (data != null) {
  249               if (inscript != 0) {
  250                   callback.handleComment(data, getBlockStartPosition());
  251                   return;
  252               }
  253               if (inbody != 0 || ((instyle != 0) ||
  254                                   ((intitle != 0) && !seentitle))) {
  255                   if (debugFlag) {
  256                       debug("text:  ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
  257                   }
  258                   callback.handleText(data, getBlockStartPosition());
  259               }
  260           }
  261       }
  262   
  263       /*
  264        * Error handling.
  265        */
  266       protected void handleError(int ln, String errorMsg) {
  267           if (debugFlag) {
  268               debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
  269           }
  270           /* PENDING: need to improve the error string. */
  271           callback.handleError(errorMsg, getCurrentPos());
  272       }
  273   
  274   
  275       /*
  276        * debug messages
  277        */
  278       private void debug(String msg) {
  279           System.out.println(msg);
  280       }
  281   }

Save This Page
Home » openjdk-7 » javax » swing » text » html » parser » [javadoc | source]