Source code: org/apache/xerces/utils/regex/RegularExpression.java
1 /*
2 * The Apache Software License, Version 1.1
3 *
4 *
5 * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
6 * reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution,
21 * if any, must include the following acknowledgment:
22 * "This product includes software developed by the
23 * Apache Software Foundation (http://www.apache.org/)."
24 * Alternately, this acknowledgment may appear in the software itself,
25 * if and wherever such third-party acknowledgments normally appear.
26 *
27 * 4. The names "Xerces" and "Apache Software Foundation" must
28 * not be used to endorse or promote products derived from this
29 * software without prior written permission. For written
30 * permission, please contact apache@apache.org.
31 *
32 * 5. Products derived from this software may not be called "Apache",
33 * nor may "Apache" appear in their name, without prior written
34 * permission of the Apache Software Foundation.
35 *
36 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 * SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This software consists of voluntary contributions made by many
51 * individuals on behalf of the Apache Software Foundation and was
52 * originally based on software copyright (c) 1999, International
53 * Business Machines, Inc., http://www.apache.org. For more
54 * information on the Apache Software Foundation, please see
55 * <http://www.apache.org/>.
56 */
57
58 package org.apache.xerces.utils.regex;
59
60
61 import java.text.CharacterIterator;
62
63 /**
64 * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
65 * This engine does not conform to the POSIX regular expression.
66 *
67 * <hr width="50%">
68 * <h3>How to use</h3>
69 *
70 * <dl>
71 * <dt>A. Standard way
72 * <dd>
73 * <pre>
74 * RegularExpression re = new RegularExpression(<var>regex</var>);
75 * if (re.matches(text)) { ... }
76 * </pre>
77 *
78 * <dt>B. Capturing groups
79 * <dd>
80 * <pre>
81 * RegularExpression re = new RegularExpression(<var>regex</var>);
82 * Match match = new Match();
83 * if (re.matches(text, match)) {
84 * ... // You can refer captured texts with methods of the <code>Match</code> class.
85 * }
86 * </pre>
87 *
88 * </dl>
89 *
90 * <h4>Case-insensitive matching</h4>
91 * <pre>
92 * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
93 * if (re.matches(text) >= 0) { ...}
94 * </pre>
95 *
96 * <h4>Options</h4>
97 * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
98 * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
99 * This <var>options</var> parameter consists of the following characters.
100 * </p>
101 * <dl>
102 * <dt><a name="I_OPTION"><code>"i"</code></a>
103 * <dd>This option indicates case-insensitive matching.
104 * <dt><a name="M_OPTION"><code>"m"</code></a>
105 * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
106 * <dt><a name="S_OPTION"><code>"s"</code></a>
107 * <dd class="REGEX"><kbd>.</kbd> matches any one character.
108 * <dt><a name="U_OPTION"><code>"u"</code></a>
109 * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode.
110 * <dt><a name="W_OPTION"><code>"w"</code></a>
111 * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of
112 * 'Unicode Regular Expression Guidelines' Revision 4.
113 * When "w" and "u" are specified at the same time,
114 * <kbd>\b \B \< \></kbd> are processed for the "w" option.
115 * <dt><a name="COMMA_OPTION"><code>","</code></a>
116 * <dd>The parser treats a comma in a character class as a range separator.
117 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
118 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
119 *
120 * <dt><a name="X_OPTION"><code>"X"</code></a>
121 * <dd class="REGEX">
122 * By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
123 * The <code>match()</code> method does not do subsring matching
124 * but entire string matching.
125 *
126 * </dl>
127 *
128 * <hr width="50%">
129 * <h3>Syntax</h3>
130 * <table border="1" bgcolor="#ddeeff">
131 * <tr>
132 * <td>
133 * <h4>Differences from the Perl 5 regular expression</h4>
134 * <ul>
135 * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
136 * <li>Supports subtraction, union, and intersection operations for character classes.
137 * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
138 * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
139 * <kbd>\u005cu</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
140 * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
141 * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
142 * </ul>
143 * </td>
144 * </tr>
145 * </table>
146 *
147 * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
148 * <ul>
149 * <li>Character
150 * <dl>
151 * <dt class="REGEX"><kbd>.</kbd> (A period)
152 * <dd>Matches any one character except the following characters.
153 * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
154 * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
155 * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
156 * <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
157 * it matches any character including the above four characters.
158 *
159 * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
160 * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
161 * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
162 *
163 * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
164 * <dd>Matches a control character.
165 * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
166 * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
167 * It matches a control character of which the character code is less than
168 * the character code of the <var>C</var> by 0x0040.
169 * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
170 * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
171 *
172 * <dt class="REGEX">a non-meta character
173 * <dd>Matches the character.
174 *
175 * <dt class="REGEX"><KBD>\</KBD> + a meta character
176 * <dd>Matches the meta character.
177 *
178 * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
179 * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
180 * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
181 * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
182 *
183 * <!--
184 * <dt class="REGEX"><kbd>\u005cu</kbd><var>HHHH</var>
185 * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
186 * -->
187 *
188 * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
189 * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
190 *
191 * <dt class="REGEX"><kbd>\g</kbd>
192 * <dd>Matches a grapheme.
193 * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
194 *
195 * <dt class="REGEX"><kbd>\X</kbd>
196 * <dd class="REGEX">Matches a combining character sequence.
197 * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
198 * </dl>
199 * </li>
200 *
201 * <li>Character class
202 * <dl>
203 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
204 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
205 * <dd>Positive character class. It matches a character in ranges.
206 * <dd><var>R<sub>n</sub></var>:
207 * <ul>
208 * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005cu</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
209 * <p>This range matches the character.
210 * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
211 * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point.
212 + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
213 + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
214 * <p>...
215 * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
216 * <p>These expressions specifies the same ranges as the following expressions.
217 * </ul>
218 * <p class="REGEX">Enumerated ranges are merged (union operation).
219 * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
220 *
221 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
222 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
223 * <dd>Negative character class. It matches a character not in ranges.
224 *
225 * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
226 * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
227 * <dd>Subtraction or union or intersection for character classes.
228 * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
229 * <dd>The result of this operations is a <u>positive character class</u>
230 * even if an expression includes any negative character classes.
231 * You have to take care on this in case-insensitive matching.
232 * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
233 * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
234 * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
235 * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
236 * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
237 *
238 * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
239 * <dd>Character class subtraction for the XML Schema.
240 * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
241 *
242 * <dt class="REGEX"><kbd>\d</kbd>
243 * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
244 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
245 * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
246 *
247 * <dt class="REGEX"><kbd>\D</kbd>
248 * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
249 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
250 * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
251 *
252 * <dt class="REGEX"><kbd>\s</kbd>
253 * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
254 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
255 * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
256 *
257 * <dt class="REGEX"><kbd>\S</kbd>
258 * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
259 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
260 * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
261 *
262 * <dt class="REGEX"><kbd>\w</kbd>
263 * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
264 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
265 * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
266 *
267 * <dt class="REGEX"><kbd>\W</kbd>
268 * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
269 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
270 * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
271 *
272 * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
273 * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
274 * The following names are available:
275 * <dl>
276 * <dt>Unicode General Categories:
277 * <dd><kbd>
278 * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
279 * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
280 * </kbd>
281 * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
282 * <dt>Unicode Blocks:
283 * <dd><kbd>
284 * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
285 * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
286 * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
287 * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
288 * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
289 * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
290 * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
291 * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
292 * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
293 * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
294 * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
295 * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
296 * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
297 * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
298 * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
299 * Small Form Variants, Arabic Presentation Forms-B, Specials,
300 * Halfwidth and Fullwidth Forms
301 * </kbd>
302 * <dt>Others:
303 * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
304 * <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
305 * <dd><kbd>UNASSGINED</kbd>
306 * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
307 * </dl>
308 *
309 * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
310 * <dd>Matches one character not in the specified General Category or the specified Block.
311 * </dl>
312 * </li>
313 *
314 * <li>Selection and Quantifier
315 * <dl>
316 * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
317 * <dd>...
318 *
319 * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
320 * <dd>Matches 0 or more <var>X</var>.
321 *
322 * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
323 * <dd>Matches 1 or more <var>X</var>.
324 *
325 * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
326 * <dd>Matches 0 or 1 <var>X</var>.
327 *
328 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
329 * <dd>Matches <var>number</var> times.
330 *
331 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
332 * <dd>...
333 *
334 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
335 * <dd>...
336 *
337 * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
338 * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
339 * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
340 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
341 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
342 * <dd>Non-greedy matching.
343 * </dl>
344 * </li>
345 *
346 * <li>Grouping, Capturing, and Back-reference
347 * <dl>
348 * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
349 * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
350 * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
351 * you have to write "<KBD>(?:foo)+</KBD>".
352 *
353 * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
354 * <dd>Grouping with capturing.
355 * It make a group and applications can know
356 * where in target text a group matched with methods of a <code>Match</code> instance
357 * after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
358 * The 0th group means whole of this regular expression.
359 * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
360 *
361 * <p>For instance, a regular expression is
362 * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>"
363 * and target text is
364 * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>":
365 * <ul>
366 * <li><code>Match.getCapturedText(0)</code>:
367 * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>"
368 * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
369 * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
370 * </ul>
371 *
372 * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
373 * <dd>
374 *
375 * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
376 * <dd>Independent expression group. ................
377 *
378 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
379 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
380 * <dd>............................
381 * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
382 * Note that it can not contain 'u'.
383 *
384 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
385 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
386 * <dd>......
387 * <dd>These expressions must be at the beginning of a group.
388 * </dl>
389 * </li>
390 *
391 * <li>Anchor
392 * <dl>
393 * <dt class="REGEX"><kbd>\A</kbd>
394 * <dd>Matches the beginnig of the text.
395 *
396 * <dt class="REGEX"><kbd>\Z</kbd>
397 * <dd>Matches the end of the text, or before an EOL character at the end of the text,
398 * or CARRIAGE RETURN + LINE FEED at the end of the text.
399 *
400 * <dt class="REGEX"><kbd>\z</kbd>
401 * <dd>Matches the end of the text.
402 *
403 * <dt class="REGEX"><kbd>^</kbd>
404 * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
405 * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
406 * it matches the beginning of the text, or after one of EOL characters (
407 * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
408 * PARAGRAPH SEPARATOR (U+2029).)
409 *
410 * <dt class="REGEX"><kbd>$</kbd>
411 * <dd>Matches the end of the text, or before an EOL character at the end of the text,
412 * or CARRIAGE RETURN + LINE FEED at the end of the text.
413 * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
414 * it matches the end of the text, or before an EOL character.
415 *
416 * <dt class="REGEX"><kbd>\b</kbd>
417 * <dd>Matches word boundary.
418 * (See <a href="#W_OPTION">a "w" option</a>)
419 *
420 * <dt class="REGEX"><kbd>\B</kbd>
421 * <dd>Matches non word boundary.
422 * (See <a href="#W_OPTION">a "w" option</a>)
423 *
424 * <dt class="REGEX"><kbd>\<</kbd>
425 * <dd>Matches the beginning of a word.
426 * (See <a href="#W_OPTION">a "w" option</a>)
427 *
428 * <dt class="REGEX"><kbd>\></kbd>
429 * <dd>Matches the end of a word.
430 * (See <a href="#W_OPTION">a "w" option</a>)
431 * </dl>
432 * </li>
433 * <li>Lookahead and lookbehind
434 * <dl>
435 * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
436 * <dd>Lookahead.
437 *
438 * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
439 * <dd>Negative lookahead.
440 *
441 * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd>
442 * <dd>Lookbehind.
443 * <dd>(Note for text capturing......)
444 *
445 * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd>
446 * <dd>Negative lookbehind.
447 * </dl>
448 * </li>
449 *
450 * <li>Misc.
451 * <dl>
452 * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
453 * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
454 * <dd>......
455 * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
456 * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
457 * You can not write comments in character classes and before quantifiers.
458 * </dl>
459 * </li>
460 * </ul>
461 *
462 *
463 * <hr width="50%">
464 * <h3>BNF for the regular expression</h3>
465 * <pre>
466 * regex ::= ('(?' options ')')? term ('|' term)*
467 * term ::= factor+
468 * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
469 * | '(?#' [^)]* ')'
470 * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
471 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
472 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
473 * | '(?>' regex ')' | '(?' options ':' regex ')'
474 * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
475 * options ::= [imsw]* ('-' [imsw]+)?
476 * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
477 * looks ::= '(?=' regex ')' | '(?!' regex ')'
478 * | '(?<=' regex ')' | '(?<!' regex ')'
479 * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
480 * category-block ::= '\' [pP] category-symbol-1
481 * | ('\p{' | '\P{') (category-symbol | block-name
482 * | other-properties) '}'
483 * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
484 * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
485 * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
486 * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
487 * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
488 * | 'Sm' | 'Sc' | 'Sk' | 'So'
489 * block-name ::= (See above)
490 * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
491 * character-1 ::= (any character except meta-characters)
492 *
493 * char-class ::= '[' ranges ']'
494 * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
495 * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
496 * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
497 * | range-char | range-char '-' range-char
498 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
499 * code-point ::= '\x' hex-char hex-char
500 * | '\x{' hex-char+ '}'
501 * <!-- | '\u005cu' hex-char hex-char hex-char hex-char
502 * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
503 * hex-char ::= [0-9a-fA-F]
504 * character-2 ::= (any character except \[]-,)
505 * </pre>
506 *
507 * <hr width="50%">
508 * <h3>TODO</h3>
509 * <ul>
510 * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
511 * <ul>
512 * <li>2.4 Canonical Equivalents
513 * <li>Level 3
514 * </ul>
515 * <li>Parsing performance
516 * </ul>
517 *
518 * <hr width="50%">
519 *
520 * @author TAMURA Kent <kent@trl.ibm.co.jp>
521 */
522 public class RegularExpression implements java.io.Serializable {
523 static final boolean DEBUG = false;
524
525 /**
526 * Compiles a token tree into an operation flow.
527 */
528 private synchronized void compile(Token tok) {
529 if (this.operations != null)
530 return;
531 this.numberOfClosures = 0;
532 this.operations = this.compile(tok, null, false);
533 }
534
535 /**
536 * Converts a token to an operation.
537 */
538 private Op compile(Token tok, Op next, boolean reverse) {
539 Op ret;
540 switch (tok.type) {
541 case Token.DOT:
542 ret = Op.createDot();
543 ret.next = next;
544 break;
545
546 case Token.CHAR:
547 ret = Op.createChar(tok.getChar());
548 ret.next = next;
549 break;
550
551 case Token.ANCHOR:
552 ret = Op.createAnchor(tok.getChar());
553 ret.next = next;
554 break;
555
556 case Token.RANGE:
557 case Token.NRANGE:
558 ret = Op.createRange(tok);
559 ret.next = next;
560 break;
561
562 case Token.CONCAT:
563 ret = next;
564 if (!reverse) {
565 for (int i = tok.size()-1; i >= 0; i --) {
566 ret = compile(tok.getChild(i), ret, false);
567 }
568 } else {
569 for (int i = 0; i < tok.size(); i ++) {
570 ret = compile(tok.getChild(i), ret, true);
571 }
572 }
573 break;
574
575 case Token.UNION:
576 Op.UnionOp uni = Op.createUnion(tok.size());
577 for (int i = 0; i < tok.size(); i ++) {
578 uni.addElement(compile(tok.getChild(i), next, reverse));
579 }
580 ret = uni; // ret.next is null.
581 break;
582
583 case Token.CLOSURE:
584 case Token.NONGREEDYCLOSURE:
585 Token child = tok.getChild(0);
586 int min = tok.getMin();
587 int max = tok.getMax();
588 if (min >= 0 && min == max) { // {n}
589 ret = next;
590 for (int i = 0; i < min; i ++) {
591 ret = compile(child, ret, reverse);
592 }
593 break;
594 }
595 if (min > 0 && max > 0)
596 max -= min;
597 if (max > 0) {
598 // X{2,6} -> XX(X(X(XX?)?)?)?
599 ret = next;
600 for (int i = 0; i < max; i ++) {
601 Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
602 q.next = next;
603 q.setChild(compile(child, ret, reverse));
604 ret = q;
605 }
606 } else {
607 Op.ChildOp op;
608 if (tok.type == Token.NONGREEDYCLOSURE) {
609 op = Op.createNonGreedyClosure();
610 } else { // Token.CLOSURE
611 if (child.getMinLength() == 0)
612 op = Op.createClosure(this.numberOfClosures++);
613 else
614 op = Op.createClosure(-1);
615 }
616 op.next = next;
617 op.setChild(compile(child, op, reverse));
618 ret = op;
619 }
620 if (min > 0) {
621 for (int i = 0; i < min; i ++) {
622 ret = compile(child, ret, reverse);
623 }
624 }
625 break;
626
627 case Token.EMPTY:
628 ret = next;
629 break;
630
631 case Token.STRING:
632 ret = Op.createString(tok.getString());
633 ret.next = next;
634 break;
635
636 case Token.BACKREFERENCE:
637 ret = Op.createBackReference(tok.getReferenceNumber());
638 ret.next = next;
639 break;
640
641 case Token.PAREN:
642 if (tok.getParenNumber() == 0) {
643 ret = compile(tok.getChild(0), next, reverse);
644 } else if (reverse) {
645 next = Op.createCapture(tok.getParenNumber(), next);
646 next = compile(tok.getChild(0), next, reverse);
647 ret = Op.createCapture(-tok.getParenNumber(), next);
648 } else {
649 next = Op.createCapture(-tok.getParenNumber(), next);
650 next = compile(tok.getChild(0), next, reverse);
651 ret = Op.createCapture(tok.getParenNumber(), next);
652 }
653 break;
654
655 case Token.LOOKAHEAD:
656 ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
657 break;
658 case Token.NEGATIVELOOKAHEAD:
659 ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
660 break;
661 case Token.LOOKBEHIND:
662 ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
663 break;
664 case Token.NEGATIVELOOKBEHIND:
665 ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
666 break;
667
668 case Token.INDEPENDENT:
669 ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
670 break;
671
672 case Token.MODIFIERGROUP:
673 ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
674 ((Token.ModifierToken)tok).getOptions(),
675 ((Token.ModifierToken)tok).getOptionsMask());
676 break;
677
678 case Token.CONDITION:
679 Token.ConditionToken ctok = (Token.ConditionToken)tok;
680 int ref = ctok.refNumber;
681 Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
682 Op yes = compile(ctok.yes, next, reverse);
683 Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
684 ret = Op.createCondition(next, ref, condition, yes, no);
685 break;
686
687 default:
688 throw new RuntimeException("Unknown token type: "+tok.type);
689 } // switch (tok.type)
690 return ret;
691 }
692
693
694 //Public
695
696 /**
697 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
698 *
699 * @return true if the target is matched to this regular expression.
700 */
701 public boolean matches(char[] target) {
702 return this.matches(target, 0, target .length , (Match)null);
703 }
704
705 /**
706 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
707 * in specified range or not.
708 *
709 * @param start Start offset of the range.
710 * @param end End offset +1 of the range.
711 * @return true if the target is matched to this regular expression.
712 */
713 public boolean matches(char[] target, int start, int end) {
714 return this.matches(target, start, end, (Match)null);
715 }
716
717 /**
718 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
719 *
720 * @param match A Match instance for storing matching result.
721 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
722 */
723 public boolean matches(char[] target, Match match) {
724 return this.matches(target, 0, target .length , match);
725 }
726
727
728 /**
729 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
730 * in specified range or not.
731 *
732 * @param start Start offset of the range.
733 * @param end End offset +1 of the range.
734 * @param match A Match instance for storing matching result.
735 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
736 */
737 public boolean matches(char[] target, int start, int end, Match match) {
738
739 synchronized (this) {
740 if (this.operations == null)
741 this.prepare();
742 if (this.context == null)
743 this.context = new Context();
744 }
745 Context con = null;
746 synchronized (this.context) {
747 con = this.context.inuse ? new Context() : this.context;
748 con.reset(target, start, end, this.numberOfClosures);
749 }
750 if (match != null) {
751 match.setNumberOfGroups(this.nofparen);
752 match.setSource(target);
753 } else if (this.hasBackReferences) {
754 match = new Match();
755 match.setNumberOfGroups(this.nofparen);
756 // Need not to call setSource() because
757 // a caller can not access this match instance.
758 }
759 con.match = match;
760
761 if (this.isSet(this.options, XMLSCHEMA_MODE)) {
762 int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
763 //System.err.println("DEBUG: matchEnd="+matchEnd);
764 if (matchEnd == con.limit) {
765 if (con.match != null) {
766 con.match.setBeginning(0, con.start);
767 con.match.setEnd(0, matchEnd);
768 }
769 con.inuse = false;
770 return true;
771 }
772 return false;
773 }
774
775 /*
776 * The pattern has only fixed string.
777 * The engine uses Boyer-Moore.
778 */
779 if (this.fixedStringOnly) {
780 //System.err.println("DEBUG: fixed-only: "+this.fixedString);
781 int o = this.fixedStringTable.matches(target, con.start, con.limit);
782 if (o >= 0) {
783 if (con.match != null) {
784 con.match.setBeginning(0, o);
785 con.match.setEnd(0, o+this.fixedString.length());
786 }
787 con.inuse = false;
788 return true;
789 }
790 con.inuse = false;
791 return false;
792 }
793
794 /*
795 * The pattern contains a fixed string.
796 * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
797 * If not, it return with false.
798 */
799 if (this.fixedString != null) {
800 int o = this.fixedStringTable.matches(target, con.start, con.limit);
801 if (o < 0) {
802 //System.err.println("Non-match in fixed-string search.");
803 con.inuse = false;
804 return false;
805 }
806 }
807
808 int limit = con.limit-this.minlength;
809 int matchStart;
810 int matchEnd = -1;
811
812 /*
813 * Checks whether the expression starts with ".*".
814 */
815 if (this.operations != null
816 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
817 if (isSet(this.options, SINGLE_LINE)) {
818 matchStart = con.start;
819 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
820 } else {
821 boolean previousIsEOL = true;
822 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
823 int ch = target [ matchStart ] ;
824 if (isEOLChar(ch)) {
825 previousIsEOL = true;
826 } else {
827 if (previousIsEOL) {
828 if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
829 matchStart, 1, this.options)))
830 break;
831 }
832 previousIsEOL = false;
833 }
834 }
835 }
836 }
837
838 /*
839 * Optimization against the first character.
840 */
841 else if (this.firstChar != null) {
842 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
843 RangeToken range = this.firstChar;
844 if (this.isSet(this.options, IGNORE_CASE)) {
845 range = this.firstChar.getCaseInsensitiveToken();
846 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
847 int ch = target [ matchStart ] ;
848 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
849 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
850 if (!range.match(ch)) continue;
851 } else {
852 if (!range.match(ch)) {
853 char ch1 = Character.toUpperCase((char)ch);
854 if (!range.match(ch1))
855 if (!range.match(Character.toLowerCase(ch1)))
856 continue;
857 }
858 }
859 if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
860 matchStart, 1, this.options)))
861 break;
862 }
863 } else {
864 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
865 int ch = target [ matchStart ] ;
866 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
867 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
868 if (!range.match(ch)) continue;
869 if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
870 matchStart, 1, this.options)))
871 break;
872 }
873 }
874 }
875
876 /*
877 * Straightforward matching.
878 */
879 else {
880 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
881 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
882 break;
883 }
884 }
885
886 if (matchEnd >= 0) {
887 if (con.match != null) {
888 con.match.setBeginning(0, matchStart);
889 con.match.setEnd(0, matchEnd);
890 }
891 con.inuse = false;
892 return true;
893 } else {
894 con.inuse = false;
895 return false;
896 }
897 }
898
899 /**
900 * @return -1 when not match; offset of the end of matched string when match.
901 */
902 private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
903
904 char[] target = con.charTarget;
905
906
907 while (true) {
908 if (op == null)
909 return offset;
910 if (offset > con.limit || offset < con.start)
911 return -1;
912 switch (op.type) {
913 case Op.CHAR:
914 if (isSet(opts, IGNORE_CASE)) {
915 int ch = op.getData();
916 if (dx > 0) {
917 if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] ))
918 return -1;
919 offset ++;
920 } else {
921 int o1 = offset-1;
922 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] ))
923 return -1;
924 offset = o1;
925 }
926 } else {
927 int ch = op.getData();
928 if (dx > 0) {
929 if (offset >= con.limit || ch != target [ offset ] )
930 return -1;
931 offset ++;
932 } else {
933 int o1 = offset-1;
934 if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] )
935 return -1;
936 offset = o1;
937 }
938 }
939 op = op.next;
940 break;
941
942 case Op.DOT:
943 if (dx > 0) {
944 if (offset >= con.limit)
945 return -1;
946 int ch = target [ offset ] ;
947 if (isSet(opts, SINGLE_LINE)) {
948 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
949 offset ++;
950 } else {
951 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
952 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
953 if (isEOLChar(ch))
954 return -1;
955 }
956 offset ++;
957 } else {
958 int o1 = offset-1;
959 if (o1 >= con.limit || o1 < 0)
960 return -1;
961 int ch = target [ o1 ] ;
962 if (isSet(opts, SINGLE_LINE)) {
963 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
964 o1 --;
965 } else {
966 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
967 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
968 if (!isEOLChar(ch))
969 return -1;
970 }
971 offset = o1;
972 }
973 op = op.next;
974 break;
975
976 case Op.RANGE:
977 case Op.NRANGE:
978 if (dx > 0) {
979 if (offset >= con.limit)
980 return -1;
981 int ch = target [ offset ] ;
982 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
983 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
984 RangeToken tok = op.getToken();
985 if (isSet(opts, IGNORE_CASE)) {
986 tok = tok.getCaseInsensitiveToken();
987 if (!tok.match(ch)) {
988 if (ch >= 0x10000) return -1;
989 char uch;
990 if (!tok.match(uch = Character.toUpperCase((char)ch))
991 && !tok.match(Character.toLowerCase(uch)))
992 return -1;
993 }
994 } else {
995 if (!tok.match(ch)) return -1;
996 }
997 offset ++;
998 } else {
999 int o1 = offset-1;
1000 if (o1 >= con.limit || o1 < 0)
1001 return -1;
1002 int ch = target [ o1 ] ;
1003 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1004 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
1005 RangeToken tok = op.getToken();
1006 if (isSet(opts, IGNORE_CASE)) {
1007 tok = tok.getCaseInsensitiveToken();
1008 if (!tok.match(ch)) {
1009 if (ch >= 0x10000) return -1;
1010 char uch;
1011 if (!tok.match(uch = Character.toUpperCase((char)ch))
1012 && !tok.match(Character.toLowerCase(uch)))
1013 return -1;
1014 }
1015 } else {
1016 if (!tok.match(ch)) return -1;
1017 }
1018 offset = o1;
1019 }
1020 op = op.next;
1021 break;
1022
1023 case Op.ANCHOR:
1024 boolean go = false;
1025 switch (op.getData()) {
1026 case '^':
1027 if (isSet(opts, MULTIPLE_LINES)) {
1028 if (!(offset == con.start
1029 || offset > con.start && isEOLChar( target [ offset-1 ] )))
1030 return -1;
1031 } else {
1032 if (offset != con.start)
1033 return -1;
1034 }
1035 break;
1036
1037 case '@': // Internal use only.
1038 // The @ always matches line beginnings.
1039 if (!(offset == con.start
1040 || offset > con.start && isEOLChar( target [ offset-1 ] )))
1041 return -1;
1042 break;
1043
1044 case '$':
1045 if (isSet(opts, MULTIPLE_LINES)) {
1046 if (!(offset == con.limit
1047 || offset < con.limit && isEOLChar( target [ offset ] )))
1048 return -1;
1049 } else {
1050 if (!(offset == con.limit
1051 || offset+1 == con.limit && isEOLChar( target [ offset ] )
1052 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1053 && target [ offset+1 ] == LINE_FEED))
1054 return -1;
1055 }
1056 break;
1057
1058 case 'A':
1059 if (offset != con.start) return -1;
1060 break;
1061
1062 case 'Z':
1063 if (!(offset == con.limit
1064 || offset+1 == con.limit && isEOLChar( target [ offset ] )
1065 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1066 && target [ offset+1 ] == LINE_FEED))
1067 return -1;
1068 break;
1069
1070 case 'z':
1071 if (offset != con.limit) return -1;
1072 break;
1073
1074 case 'b':
1075 if (con.length == 0) return -1;
1076 {
1077 int after = getWordType(target, con.start, con.limit, offset, opts);
1078 if (after == WT_IGNORE) return -1;
1079 int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1080 if (after == before) return -1;
1081 }
1082 break;
1083
1084 case 'B':
1085 if (con.length == 0)
1086 go = true;
1087 else {
1088 int after = getWordType(target, con.start, con.limit, offset, opts);
1089 go = after == WT_IGNORE
1090 || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1091 }
1092 if (!go) return -1;
1093 break;
1094
1095 case '<':
1096 if (con.length == 0 || offset == con.limit) return -1;
1097 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1098 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1099 return -1;
1100 break;
1101
1102 case '>':
1103 if (con.length == 0 || offset == con.start) return -1;
1104 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1105 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1106 return -1;
1107 break;
1108 } // switch anchor type
1109 op = op.next;
1110 break;
1111
1112 case Op.BACKREFERENCE:
1113 {
1114 int refno = op.getData();
1115 if (refno <= 0 || refno >= this.nofparen)
1116 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1117 if (con.match.getBeginning(refno) < 0
1118 || con.match.getEnd(refno) < 0)
1119 return -1; // ********
1120 int o2 = con.match.getBeginning(refno);
1121 int literallen = con.match.getEnd(refno)-o2;
1122 if (!isSet(opts, IGNORE_CASE)) {
1123 if (dx > 0) {
1124 if (!regionMatches(target, offset, con.limit, o2, literallen))
1125 return -1;
1126 offset += literallen;
1127 } else {
1128 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1129 return -1;
1130 offset -= literallen;
1131 }
1132 } else {
1133 if (dx > 0) {
1134 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1135 return -1;
1136 offset += literallen;
1137 } else {
1138 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1139 o2, literallen))
1140 return -1;
1141 offset -= literallen;
1142 }
1143 }
1144 }
1145 op = op.next;
1146 break;
1147 case Op.STRING:
1148 {
1149 String literal = op.getString();
1150 int literallen = literal.length();
1151 if (!isSet(opts, IGNORE_CASE)) {
1152 if (dx > 0) {
1153 if (!regionMatches(target, offset, con.limit, literal, literallen))
1154 return -1;
1155 offset += literallen;
1156 } else {
1157 if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1158 return -1;
1159 offset -= literallen;
1160 }
1161 } else {
1162 if (dx > 0) {
1163 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1164 return -1;
1165 offset += literallen;
1166 } else {
1167 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1168 literal, literallen))
1169 return -1;
1170 offset -= literallen;
1171 }
1172 }
1173 }
1174 op = op.next;
1175 break;
1176
1177 case Op.CLOSURE:
1178 {
1179 /*
1180 * Saves current position to avoid
1181 * zero-width repeats.
1182 */
1183 int id = op.getData();
1184 if (id >= 0) {
1185 int previousOffset = con.offsets[id];
1186 if (previousOffset < 0 || previousOffset != offset) {
1187 con.offsets[id] = offset;
1188 } else {
1189 con.offsets[id] = -1;
1190 op = op.next;
1191 break;
1192 }
1193 }
1194
1195 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1196 if (id >= 0) con.offsets[id] = -1;
1197 if (ret >= 0) return ret;
1198 op = op.next;
1199 }
1200 break;
1201
1202 case Op.QUESTION:
1203 {
1204 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1205 if (ret >= 0) return ret;
1206 op = op.next;
1207 }
1208 break;
1209
1210 case Op.NONGREEDYCLOSURE:
1211 case Op.NONGREEDYQUESTION:
1212 {
1213 int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1214 if (ret >= 0) return ret;
1215 op = op.getChild();
1216 }
1217 break;
1218
1219 case Op.UNION:
1220 for (int i = 0; i < op.size(); i ++) {
1221 int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
1222 //System.err.println("UNION: "+i+", ret="+ret);
1223 if (ret >= 0) return ret;
1224 }
1225 return -1;
1226
1227 case Op.CAPTURE:
1228 int refno = op.getData();
1229 if (con.match != null && refno > 0) {
1230 int save = con.match.getBeginning(refno);
1231 con.match.setBeginning(refno, offset);
1232 int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1233