Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: gnu/javax/swing/text/html/parser/support/low/Constants.java


1   /* Constants.java --
2      Copyright (C) 2005 Free Software Foundation, Inc.
3   
4   This file is part of GNU Classpath.
5   
6   GNU Classpath is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10  
11  GNU Classpath is distributed in the hope that it will be useful, but
12  WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  General Public License for more details.
15  
16  You should have received a copy of the GNU General Public License
17  along with GNU Classpath; see the file COPYING.  If not, write to the
18  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  02110-1301 USA.
20  
21  Linking this library statically or dynamically with other modules is
22  making a combined work based on this library.  Thus, the terms and
23  conditions of the GNU General Public License cover the whole
24  combination.
25  
26  As a special exception, the copyright holders of this library give you
27  permission to link this library with independent modules to produce an
28  executable, regardless of the license terms of these independent
29  modules, and to copy and distribute the resulting executable under
30  terms of your choice, provided that you also meet, for each linked
31  independent module, the terms and conditions of the license of that
32  module.  An independent module is a module which is not derived from
33  or based on this library.  If you modify this library, you may extend
34  this exception to your version of the library, but you are not
35  obligated to do so.  If you do not wish to do so, delete this
36  exception statement from your version. */
37  
38  
39  package gnu.javax.swing.text.html.parser.support.low;
40  
41  import java.util.BitSet;
42  
43  /**
44   * The parser constants and operations, directly related to the parser
45   * constants.
46   * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
47   */
48  public class Constants
49  {
50    /* Single character tokens are reflected into they ASCII codes. */
51  
52    /**
53     * Start of HTML token.
54     */
55    public static final int BEGIN = '<';
56  
57    /**
58     * End of HTML token.
59     */
60    public static final int END = '>';
61  
62    /**
63     * Exclamation (indicates SGML or comment).
64     */
65    public static final int EXCLAMATION = '!';
66  
67    /**
68     * Slash (indicates closing tag).
69     */
70    public static final int SLASH = '/';
71  
72    /**
73     * Equals sign.
74     */
75    public static final int EQ = '=';
76  
77    /**
78     * Quoting sign.
79     */
80    public static final int AP = '\'';
81  
82    /**
83     * Quoting sign.
84     */
85    public static final int QUOT = '"';
86  
87    /* The numbers of other tokens start outside the ascii space. */
88    /* String tokens */
89  
90    /**
91     * Double dash (--)
92     */
93    public static final int DOUBLE_DASH = 1000;
94  
95    /**
96     * The STYLE tag (needs special handling).
97     */
98    public static final int STYLE = 1001;
99  
100   /**
101    * The SCRIPT tag (needs special handling).
102    */
103   public static final int SCRIPT = 1002;
104 
105   /* Pattern tokens */
106 
107   /**
108    * HTML whitespace.
109    */
110   public static final int WS = 1003;
111 
112   /**
113    * Named or numeric entity,
114    */
115   public static final int ENTITY = 1004;
116 
117   /**
118    * Sequence of valid name characters (can start from digit).
119    */
120   public static final int NUMTOKEN = 1005;
121 
122   /* Complex tokens */
123 
124   /**
125    * Comment opening sequence.
126    */
127   public static final pattern COMMENT_OPEN =
128     new pattern(new node[]
129                 {
130                   new node(BEGIN), new node(WS, true), new node(EXCLAMATION),
131                   new node(WS, true), new node(DOUBLE_DASH),
132                 }
133                );
134 
135   /**
136    * Comment closing sequence
137    */
138   public static final pattern COMMENT_END =
139     new pattern(new node[]
140                 {
141                   new node(DOUBLE_DASH), new node(WS, true), new node(END)
142                 }
143                );
144 
145   /**
146    * Special case ---> (also is treated as end of comment).
147    */
148   public static final pattern COMMENT_TRIPLEDASH_END =
149     new pattern(new node[]
150                 {
151                   new node(DOUBLE_DASH), new node(NUMTOKEN), new node(END)
152                 }
153                );
154 
155   /**
156    * STYLE element heading pattern.
157    */
158   public static final pattern STYLE_OPEN =
159     new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(STYLE) });
160 
161   /**
162    * SCRIPT element heading pattern.
163    */
164   public static final pattern SCRIPT_OPEN =
165     new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(SCRIPT) });
166 
167   /**
168    * SGML element heading pattern.
169    */
170   public static final pattern SGML =
171     new pattern(new node[]
172                 {
173                   new node(BEGIN), new node(WS, true), new node(EXCLAMATION)
174                 }
175                );
176 
177   /**
178    * SCRIPT element closing pattern.
179    */
180   public static final pattern SCRIPT_CLOSE =
181     new pattern(new node[]
182                 {
183                   new node(BEGIN), new node(WS, true), new node(SLASH),
184                   new node(WS, true), new node(SCRIPT), new node(WS, true),
185                   new node(END)
186                 }
187                );
188 
189   /**
190    * STYLE element closing pattern.
191    */
192   public static final pattern STYLE_CLOSE =
193     new pattern(new node[]
194                 {
195                   new node(BEGIN), new node(WS, true), new node(SLASH),
196                   new node(WS, true), new node(STYLE), new node(WS, true),
197                   new node(END)
198                 }
199                );
200 
201   /**
202    * Ordinary HTML tag heading pattern.
203    */
204   public static final pattern TAG =
205     new pattern(new node[]
206                 {
207                   new node(BEGIN), new node(WS, true), new node(SLASH, true),
208                   new node(WS, true), new node(NUMTOKEN)
209                 }
210                );
211 
212   /* Special tokens */
213 
214   /**
215    * All other tokens.
216    */
217   public static final int OTHER = 1999;
218 
219   /**
220    * The UNICODE "end of text" control code
221    */
222   static final char ETX = 3;
223 
224   /**
225    * End of file.
226    */
227   public static final int EOF = ETX;
228 
229   /* Character categories */
230 
231   /**
232    * All single char tokens.
233    */
234   public static final BitSet bSINGLE_CHAR_TOKEN = new BitSet();
235 
236   /**
237    * Non letters and non numbers, allowed in HTML names.
238    */
239   public static final BitSet bSPECIAL = new BitSet();
240 
241   /**
242    * All letters, used in HTML names.
243    */
244   public static final BitSet bLETTER = new BitSet();
245 
246   /**
247    * Digits.
248    */
249   public static final BitSet bDIGIT = new BitSet();
250 
251   /**
252    * Both line breaks.
253    */
254   public static final BitSet bLINEBREAK = new BitSet();
255 
256   /**
257    * All whitespace.
258    */
259   public static final BitSet bWHITESPACE = new BitSet();
260 
261   /**
262    * Both quoting characters.
263    */
264   public static final BitSet bQUOTING = new BitSet();
265 
266   /**
267    * Valid name characters.
268    */
269   public static final BitSet bNAME = new BitSet();
270 
271   /* Entity subcategories */
272 
273   /**
274    * Named entity.
275    */
276   public static final int ENTITY_NAMED = 1;
277 
278   /**
279    * Numeric entity.
280    */
281   public static final int ENTITY_NUMERIC = 2;
282 
283   static
284   {
285     bQUOTING.set(AP);
286     bQUOTING.set(QUOT);
287 
288     bSINGLE_CHAR_TOKEN.set(BEGIN);
289     bSINGLE_CHAR_TOKEN.set(END);
290     bSINGLE_CHAR_TOKEN.set(EXCLAMATION);
291     bSINGLE_CHAR_TOKEN.set(SLASH);
292     bSINGLE_CHAR_TOKEN.set(EQ);
293     bSINGLE_CHAR_TOKEN.set(EOF);
294 
295     bSINGLE_CHAR_TOKEN.or(bQUOTING);
296 
297     bLINEBREAK.set('\r');
298     bLINEBREAK.set('\n');
299 
300     bWHITESPACE.set(' ');
301     bWHITESPACE.set('\t');
302     bWHITESPACE.set(0xC);
303     bWHITESPACE.or(bLINEBREAK);
304 
305     for (char i = '0'; i <= '9'; i++)
306       {
307         bDIGIT.set(i);
308       }
309 
310     for (char i = 'a'; i <= 'z'; i++)
311       {
312         bLETTER.set(i);
313       }
314 
315     for (char i = 'A'; i <= 'Z'; i++)
316       {
317         bLETTER.set(i);
318       }
319 
320     bSPECIAL.set('-');
321     bSPECIAL.set('_');
322     bSPECIAL.set(':');
323     bSPECIAL.set('.');
324 
325     bNAME.or(bLETTER);
326     bNAME.or(bDIGIT);
327     bNAME.or(bSPECIAL);
328   }
329 
330   /**
331    * Verifies if one of the tokens matches the end of string
332    * buffer. The last character in the string buffer is the
333    * "future character", some tokens needs to verify it the
334    * token does not continue "towards the future". If the token
335    * matches, it matches till "pre-last" character in the buffer.
336    * @param b
337    * @return
338    */
339   public Token endMatches(Buffer b)
340   {
341     if (b.length() < 2)
342       return null;
343 
344     int p = b.length() - 2;
345 
346     if (b.length() > 2 && b.charAt(p) == '-' && b.charAt(p - 1) == '-')
347       return new Token(DOUBLE_DASH, "--", b.getLocation(p - 1, p + 1));
348 
349     char last = b.charAt(p);
350 
351     if (bSINGLE_CHAR_TOKEN.get(last))
352       return new Token(last, last, b.getLocation(p, p + 1));
353 
354     char future = b.charAt(p + 1);
355 
356     // Check for numtokens, script and style:
357     if (bNAME.get(last) && !bNAME.get(future))
358       {
359         // Scan the history up:
360         int u = p - 1;
361         while (u >= 0 && bNAME.get(b.charAt(u)))
362           u--;
363         u++;
364 
365         char[] token = new char[ p - u + 1 ];
366 
367         // Found a numtoken
368         b.getChars(u, p + 1, token, 0);
369 
370         // Verify for the built-in tokens:
371         String e = new String(token);
372 
373         // found the entity reference
374         if (u > 0 && b.charAt(u - 1) == '&')
375           {
376             // The subsequent semicolon may be the part of the token
377             // as well. The semicolon must be ignored. This must be
378             // handled elsewhere.
379             return new Token(ENTITY, ENTITY_NAMED, "&" + e,
380                              b.getLocation(u - 1, p + 1)
381                             );
382           }
383 
384         // found the numeric entity reference
385         if (u > 1 && b.charAt(u - 1) == '#' && b.charAt(u - 2) == '&')
386           {
387             // The subsequent semicolon may be the part of the token
388             // as well. The semicolon must be ignored. This must be
389             // handled elsewhere.
390             return new Token(ENTITY, ENTITY_NUMERIC, "&#" + e,
391                              b.getLocation(u - 2, p + 2)
392                             );
393           }
394 
395         Location le = b.getLocation(u, p + 1);
396 
397         if (e.equalsIgnoreCase("SCRIPT"))
398           return new Token(SCRIPT, e, le);
399         else if (e.equalsIgnoreCase("STYLE"))
400           return new Token(STYLE, e, le);
401         else
402           return new Token(NUMTOKEN, e, le);
403       }
404 
405     // Check for whitespace
406     if (bWHITESPACE.get(last) && !bWHITESPACE.get(future))
407       {
408         // Scan the history up:
409         int u = p - 1;
410         while (u >= 0 && bWHITESPACE.get(b.charAt(u)))
411           u--;
412         u++;
413 
414         char[] token = new char[ p - u + 1 ];
415         b.getChars(u, p + 1, token, 0);
416 
417         return new Token(WS, new String(token), b.getLocation(u, p + 1));
418       }
419 
420     return null;
421   }
422 }