Source code: gnu/javax/swing/text/html/parser/support/low/Constants.java
1 /* Constants.java --
2 Copyright (C) 2005 Free Software Foundation, Inc.
3
4 This file is part of GNU Classpath.
5
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
37
38
39 package gnu.javax.swing.text.html.parser.support.low;
40
41 import java.util.BitSet;
42
43 /**
44 * The parser constants and operations, directly related to the parser
45 * constants.
46 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
47 */
48 public class Constants
49 {
50 /* Single character tokens are reflected into they ASCII codes. */
51
52 /**
53 * Start of HTML token.
54 */
55 public static final int BEGIN = '<';
56
57 /**
58 * End of HTML token.
59 */
60 public static final int END = '>';
61
62 /**
63 * Exclamation (indicates SGML or comment).
64 */
65 public static final int EXCLAMATION = '!';
66
67 /**
68 * Slash (indicates closing tag).
69 */
70 public static final int SLASH = '/';
71
72 /**
73 * Equals sign.
74 */
75 public static final int EQ = '=';
76
77 /**
78 * Quoting sign.
79 */
80 public static final int AP = '\'';
81
82 /**
83 * Quoting sign.
84 */
85 public static final int QUOT = '"';
86
87 /* The numbers of other tokens start outside the ascii space. */
88 /* String tokens */
89
90 /**
91 * Double dash (--)
92 */
93 public static final int DOUBLE_DASH = 1000;
94
95 /**
96 * The STYLE tag (needs special handling).
97 */
98 public static final int STYLE = 1001;
99
100 /**
101 * The SCRIPT tag (needs special handling).
102 */
103 public static final int SCRIPT = 1002;
104
105 /* Pattern tokens */
106
107 /**
108 * HTML whitespace.
109 */
110 public static final int WS = 1003;
111
112 /**
113 * Named or numeric entity,
114 */
115 public static final int ENTITY = 1004;
116
117 /**
118 * Sequence of valid name characters (can start from digit).
119 */
120 public static final int NUMTOKEN = 1005;
121
122 /* Complex tokens */
123
124 /**
125 * Comment opening sequence.
126 */
127 public static final pattern COMMENT_OPEN =
128 new pattern(new node[]
129 {
130 new node(BEGIN), new node(WS, true), new node(EXCLAMATION),
131 new node(WS, true), new node(DOUBLE_DASH),
132 }
133 );
134
135 /**
136 * Comment closing sequence
137 */
138 public static final pattern COMMENT_END =
139 new pattern(new node[]
140 {
141 new node(DOUBLE_DASH), new node(WS, true), new node(END)
142 }
143 );
144
145 /**
146 * Special case ---> (also is treated as end of comment).
147 */
148 public static final pattern COMMENT_TRIPLEDASH_END =
149 new pattern(new node[]
150 {
151 new node(DOUBLE_DASH), new node(NUMTOKEN), new node(END)
152 }
153 );
154
155 /**
156 * STYLE element heading pattern.
157 */
158 public static final pattern STYLE_OPEN =
159 new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(STYLE) });
160
161 /**
162 * SCRIPT element heading pattern.
163 */
164 public static final pattern SCRIPT_OPEN =
165 new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(SCRIPT) });
166
167 /**
168 * SGML element heading pattern.
169 */
170 public static final pattern SGML =
171 new pattern(new node[]
172 {
173 new node(BEGIN), new node(WS, true), new node(EXCLAMATION)
174 }
175 );
176
177 /**
178 * SCRIPT element closing pattern.
179 */
180 public static final pattern SCRIPT_CLOSE =
181 new pattern(new node[]
182 {
183 new node(BEGIN), new node(WS, true), new node(SLASH),
184 new node(WS, true), new node(SCRIPT), new node(WS, true),
185 new node(END)
186 }
187 );
188
189 /**
190 * STYLE element closing pattern.
191 */
192 public static final pattern STYLE_CLOSE =
193 new pattern(new node[]
194 {
195 new node(BEGIN), new node(WS, true), new node(SLASH),
196 new node(WS, true), new node(STYLE), new node(WS, true),
197 new node(END)
198 }
199 );
200
201 /**
202 * Ordinary HTML tag heading pattern.
203 */
204 public static final pattern TAG =
205 new pattern(new node[]
206 {
207 new node(BEGIN), new node(WS, true), new node(SLASH, true),
208 new node(WS, true), new node(NUMTOKEN)
209 }
210 );
211
212 /* Special tokens */
213
214 /**
215 * All other tokens.
216 */
217 public static final int OTHER = 1999;
218
219 /**
220 * The UNICODE "end of text" control code
221 */
222 static final char ETX = 3;
223
224 /**
225 * End of file.
226 */
227 public static final int EOF = ETX;
228
229 /* Character categories */
230
231 /**
232 * All single char tokens.
233 */
234 public static final BitSet bSINGLE_CHAR_TOKEN = new BitSet();
235
236 /**
237 * Non letters and non numbers, allowed in HTML names.
238 */
239 public static final BitSet bSPECIAL = new BitSet();
240
241 /**
242 * All letters, used in HTML names.
243 */
244 public static final BitSet bLETTER = new BitSet();
245
246 /**
247 * Digits.
248 */
249 public static final BitSet bDIGIT = new BitSet();
250
251 /**
252 * Both line breaks.
253 */
254 public static final BitSet bLINEBREAK = new BitSet();
255
256 /**
257 * All whitespace.
258 */
259 public static final BitSet bWHITESPACE = new BitSet();
260
261 /**
262 * Both quoting characters.
263 */
264 public static final BitSet bQUOTING = new BitSet();
265
266 /**
267 * Valid name characters.
268 */
269 public static final BitSet bNAME = new BitSet();
270
271 /* Entity subcategories */
272
273 /**
274 * Named entity.
275 */
276 public static final int ENTITY_NAMED = 1;
277
278 /**
279 * Numeric entity.
280 */
281 public static final int ENTITY_NUMERIC = 2;
282
283 static
284 {
285 bQUOTING.set(AP);
286 bQUOTING.set(QUOT);
287
288 bSINGLE_CHAR_TOKEN.set(BEGIN);
289 bSINGLE_CHAR_TOKEN.set(END);
290 bSINGLE_CHAR_TOKEN.set(EXCLAMATION);
291 bSINGLE_CHAR_TOKEN.set(SLASH);
292 bSINGLE_CHAR_TOKEN.set(EQ);
293 bSINGLE_CHAR_TOKEN.set(EOF);
294
295 bSINGLE_CHAR_TOKEN.or(bQUOTING);
296
297 bLINEBREAK.set('\r');
298 bLINEBREAK.set('\n');
299
300 bWHITESPACE.set(' ');
301 bWHITESPACE.set('\t');
302 bWHITESPACE.set(0xC);
303 bWHITESPACE.or(bLINEBREAK);
304
305 for (char i = '0'; i <= '9'; i++)
306 {
307 bDIGIT.set(i);
308 }
309
310 for (char i = 'a'; i <= 'z'; i++)
311 {
312 bLETTER.set(i);
313 }
314
315 for (char i = 'A'; i <= 'Z'; i++)
316 {
317 bLETTER.set(i);
318 }
319
320 bSPECIAL.set('-');
321 bSPECIAL.set('_');
322 bSPECIAL.set(':');
323 bSPECIAL.set('.');
324
325 bNAME.or(bLETTER);
326 bNAME.or(bDIGIT);
327 bNAME.or(bSPECIAL);
328 }
329
330 /**
331 * Verifies if one of the tokens matches the end of string
332 * buffer. The last character in the string buffer is the
333 * "future character", some tokens needs to verify it the
334 * token does not continue "towards the future". If the token
335 * matches, it matches till "pre-last" character in the buffer.
336 * @param b
337 * @return
338 */
339 public Token endMatches(Buffer b)
340 {
341 if (b.length() < 2)
342 return null;
343
344 int p = b.length() - 2;
345
346 if (b.length() > 2 && b.charAt(p) == '-' && b.charAt(p - 1) == '-')
347 return new Token(DOUBLE_DASH, "--", b.getLocation(p - 1, p + 1));
348
349 char last = b.charAt(p);
350
351 if (bSINGLE_CHAR_TOKEN.get(last))
352 return new Token(last, last, b.getLocation(p, p + 1));
353
354 char future = b.charAt(p + 1);
355
356 // Check for numtokens, script and style:
357 if (bNAME.get(last) && !bNAME.get(future))
358 {
359 // Scan the history up:
360 int u = p - 1;
361 while (u >= 0 && bNAME.get(b.charAt(u)))
362 u--;
363 u++;
364
365 char[] token = new char[ p - u + 1 ];
366
367 // Found a numtoken
368 b.getChars(u, p + 1, token, 0);
369
370 // Verify for the built-in tokens:
371 String e = new String(token);
372
373 // found the entity reference
374 if (u > 0 && b.charAt(u - 1) == '&')
375 {
376 // The subsequent semicolon may be the part of the token
377 // as well. The semicolon must be ignored. This must be
378 // handled elsewhere.
379 return new Token(ENTITY, ENTITY_NAMED, "&" + e,
380 b.getLocation(u - 1, p + 1)
381 );
382 }
383
384 // found the numeric entity reference
385 if (u > 1 && b.charAt(u - 1) == '#' && b.charAt(u - 2) == '&')
386 {
387 // The subsequent semicolon may be the part of the token
388 // as well. The semicolon must be ignored. This must be
389 // handled elsewhere.
390 return new Token(ENTITY, ENTITY_NUMERIC, "&#" + e,
391 b.getLocation(u - 2, p + 2)
392 );
393 }
394
395 Location le = b.getLocation(u, p + 1);
396
397 if (e.equalsIgnoreCase("SCRIPT"))
398 return new Token(SCRIPT, e, le);
399 else if (e.equalsIgnoreCase("STYLE"))
400 return new Token(STYLE, e, le);
401 else
402 return new Token(NUMTOKEN, e, le);
403 }
404
405 // Check for whitespace
406 if (bWHITESPACE.get(last) && !bWHITESPACE.get(future))
407 {
408 // Scan the history up:
409 int u = p - 1;
410 while (u >= 0 && bWHITESPACE.get(b.charAt(u)))
411 u--;
412 u++;
413
414 char[] token = new char[ p - u + 1 ];
415 b.getChars(u, p + 1, token, 0);
416
417 return new Token(WS, new String(token), b.getLocation(u, p + 1));
418 }
419
420 return null;
421 }
422 }