Source code: org/apache/xmlbeans/impl/regex/RegularExpression.java
1 /* Copyright 2004 The Apache Software Foundation
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package org.apache.xmlbeans.impl.regex;
17
18 import java.text.CharacterIterator;
19
20 /**
21 * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
22 * This engine does not conform to the POSIX regular expression.
23 *
24 * <hr width="50%">
25 * <h3>How to use</h3>
26 *
27 * <dl>
28 * <dt>A. Standard way
29 * <dd>
30 * <pre>
31 * RegularExpression re = new RegularExpression(<var>regex</var>);
32 * if (re.matches(text)) { ... }
33 * </pre>
34 *
35 * <dt>B. Capturing groups
36 * <dd>
37 * <pre>
38 * RegularExpression re = new RegularExpression(<var>regex</var>);
39 * Match match = new Match();
40 * if (re.matches(text, match)) {
41 * ... // You can refer captured texts with methods of the <code>Match</code> class.
42 * }
43 * </pre>
44 *
45 * </dl>
46 *
47 * <h4>Case-insensitive matching</h4>
48 * <pre>
49 * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
50 * if (re.matches(text) >= 0) { ...}
51 * </pre>
52 *
53 * <h4>Options</h4>
54 * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
55 * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
56 * This <var>options</var> parameter consists of the following characters.
57 * </p>
58 * <dl>
59 * <dt><a name="I_OPTION"><code>"i"</code></a>
60 * <dd>This option indicates case-insensitive matching.
61 * <dt><a name="M_OPTION"><code>"m"</code></a>
62 * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
63 * <dt><a name="S_OPTION"><code>"s"</code></a>
64 * <dd class="REGEX"><kbd>.</kbd> matches any one character.
65 * <dt><a name="U_OPTION"><code>"u"</code></a>
66 * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode.
67 * <dt><a name="W_OPTION"><code>"w"</code></a>
68 * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of
69 * 'Unicode Regular Expression Guidelines' Revision 4.
70 * When "w" and "u" are specified at the same time,
71 * <kbd>\b \B \< \></kbd> are processed for the "w" option.
72 * <dt><a name="COMMA_OPTION"><code>","</code></a>
73 * <dd>The parser treats a comma in a character class as a range separator.
74 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
75 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
76 *
77 * <dt><a name="X_OPTION"><code>"X"</code></a>
78 * <dd class="REGEX">
79 * By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
80 * The <code>match()</code> method does not do subsring matching
81 * but entire string matching.
82 *
83 * </dl>
84 *
85 * <hr width="50%">
86 * <h3>Syntax</h3>
87 * <table border="1" bgcolor="#ddeeff">
88 * <tr>
89 * <td>
90 * <h4>Differences from the Perl 5 regular expression</h4>
91 * <ul>
92 * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
93 * <li>Supports subtraction, union, and intersection operations for character classes.
94 * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
95 * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
96 * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
97 * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
98 * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
99 * </ul>
100 * </td>
101 * </tr>
102 * </table>
103 *
104 * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
105 * <ul>
106 * <li>Character
107 * <dl>
108 * <dt class="REGEX"><kbd>.</kbd> (A period)
109 * <dd>Matches any one character except the following characters.
110 * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
111 * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
112 * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
113 * <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
114 * it matches any character including the above four characters.
115 *
116 * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
117 * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
118 * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
119 *
120 * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
121 * <dd>Matches a control character.
122 * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
123 * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
124 * It matches a control character of which the character code is less than
125 * the character code of the <var>C</var> by 0x0040.
126 * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
127 * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
128 *
129 * <dt class="REGEX">a non-meta character
130 * <dd>Matches the character.
131 *
132 * <dt class="REGEX"><KBD>\</KBD> + a meta character
133 * <dd>Matches the meta character.
134 *
135 * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
136 * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
137 * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
138 * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
139 *
140 * <!--
141 * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
142 * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
143 * -->
144 *
145 * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
146 * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
147 *
148 * <dt class="REGEX"><kbd>\g</kbd>
149 * <dd>Matches a grapheme.
150 * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
151 *
152 * <dt class="REGEX"><kbd>\X</kbd>
153 * <dd class="REGEX">Matches a combining character sequence.
154 * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
155 * </dl>
156 * </li>
157 *
158 * <li>Character class
159 * <dl>
160 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
161 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
162 * <dd>Positive character class. It matches a character in ranges.
163 * <dd><var>R<sub>n</sub></var>:
164 * <ul>
165 * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
166 * <p>This range matches the character.
167 * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
168 * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point.
169 + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
170 + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
171 * <p>...
172 * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
173 * <p>These expressions specifies the same ranges as the following expressions.
174 * </ul>
175 * <p class="REGEX">Enumerated ranges are merged (union operation).
176 * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
177 *
178 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
179 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
180 * <dd>Negative character class. It matches a character not in ranges.
181 *
182 * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
183 * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
184 * <dd>Subtraction or union or intersection for character classes.
185 * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
186 * <dd>The result of this operations is a <u>positive character class</u>
187 * even if an expression includes any negative character classes.
188 * You have to take care on this in case-insensitive matching.
189 * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
190 * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
191 * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
192 * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
193 * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
194 *
195 * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
196 * <dd>Character class subtraction for the XML Schema.
197 * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
198 *
199 * <dt class="REGEX"><kbd>\d</kbd>
200 * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
201 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
202 * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
203 *
204 * <dt class="REGEX"><kbd>\D</kbd>
205 * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
206 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
207 * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
208 *
209 * <dt class="REGEX"><kbd>\s</kbd>
210 * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
211 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
212 * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
213 *
214 * <dt class="REGEX"><kbd>\S</kbd>
215 * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
216 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
217 * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
218 *
219 * <dt class="REGEX"><kbd>\w</kbd>
220 * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
221 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
222 * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
223 *
224 * <dt class="REGEX"><kbd>\W</kbd>
225 * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
226 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
227 * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
228 *
229 * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
230 * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
231 * The following names are available:
232 * <dl>
233 * <dt>Unicode General Categories:
234 * <dd><kbd>
235 * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
236 * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
237 * </kbd>
238 * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
239 * <dt>Unicode Blocks:
240 * <dd><kbd>
241 * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
242 * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
243 * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
244 * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
245 * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
246 * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
247 * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
248 * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
249 * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
250 * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
251 * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
252 * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
253 * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
254 * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
255 * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
256 * Small Form Variants, Arabic Presentation Forms-B, Specials,
257 * Halfwidth and Fullwidth Forms
258 * </kbd>
259 * <dt>Others:
260 * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
261 * <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
262 * <dd><kbd>UNASSGINED</kbd>
263 * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
264 * </dl>
265 *
266 * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
267 * <dd>Matches one character not in the specified General Category or the specified Block.
268 * </dl>
269 * </li>
270 *
271 * <li>Selection and Quantifier
272 * <dl>
273 * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
274 * <dd>...
275 *
276 * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
277 * <dd>Matches 0 or more <var>X</var>.
278 *
279 * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
280 * <dd>Matches 1 or more <var>X</var>.
281 *
282 * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
283 * <dd>Matches 0 or 1 <var>X</var>.
284 *
285 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
286 * <dd>Matches <var>number</var> times.
287 *
288 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
289 * <dd>...
290 *
291 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
292 * <dd>...
293 *
294 * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
295 * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
296 * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
297 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
298 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
299 * <dd>Non-greedy matching.
300 * </dl>
301 * </li>
302 *
303 * <li>Grouping, Capturing, and Back-reference
304 * <dl>
305 * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
306 * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
307 * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
308 * you have to write "<KBD>(?:foo)+</KBD>".
309 *
310 * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
311 * <dd>Grouping with capturing.
312 * It make a group and applications can know
313 * where in target text a group matched with methods of a <code>Match</code> instance
314 * after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
315 * The 0th group means whole of this regular expression.
316 * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
317 *
318 * <p>For instance, a regular expression is
319 * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>"
320 * and target text is
321 * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>":
322 * <ul>
323 * <li><code>Match.getCapturedText(0)</code>:
324 * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>"
325 * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
326 * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
327 * </ul>
328 *
329 * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
330 * <dd>
331 *
332 * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
333 * <dd>Independent expression group. ................
334 *
335 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
336 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
337 * <dd>............................
338 * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
339 * Note that it can not contain 'u'.
340 *
341 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
342 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
343 * <dd>......
344 * <dd>These expressions must be at the beginning of a group.
345 * </dl>
346 * </li>
347 *
348 * <li>Anchor
349 * <dl>
350 * <dt class="REGEX"><kbd>\A</kbd>
351 * <dd>Matches the beginnig of the text.
352 *
353 * <dt class="REGEX"><kbd>\Z</kbd>
354 * <dd>Matches the end of the text, or before an EOL character at the end of the text,
355 * or CARRIAGE RETURN + LINE FEED at the end of the text.
356 *
357 * <dt class="REGEX"><kbd>\z</kbd>
358 * <dd>Matches the end of the text.
359 *
360 * <dt class="REGEX"><kbd>^</kbd>
361 * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
362 * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
363 * it matches the beginning of the text, or after one of EOL characters (
364 * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
365 * PARAGRAPH SEPARATOR (U+2029).)
366 *
367 * <dt class="REGEX"><kbd>$</kbd>
368 * <dd>Matches the end of the text, or before an EOL character at the end of the text,
369 * or CARRIAGE RETURN + LINE FEED at the end of the text.
370 * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
371 * it matches the end of the text, or before an EOL character.
372 *
373 * <dt class="REGEX"><kbd>\b</kbd>
374 * <dd>Matches word boundary.
375 * (See <a href="#W_OPTION">a "w" option</a>)
376 *
377 * <dt class="REGEX"><kbd>\B</kbd>
378 * <dd>Matches non word boundary.
379 * (See <a href="#W_OPTION">a "w" option</a>)
380 *
381 * <dt class="REGEX"><kbd>\<</kbd>
382 * <dd>Matches the beginning of a word.
383 * (See <a href="#W_OPTION">a "w" option</a>)
384 *
385 * <dt class="REGEX"><kbd>\></kbd>
386 * <dd>Matches the end of a word.
387 * (See <a href="#W_OPTION">a "w" option</a>)
388 * </dl>
389 * </li>
390 * <li>Lookahead and lookbehind
391 * <dl>
392 * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
393 * <dd>Lookahead.
394 *
395 * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
396 * <dd>Negative lookahead.
397 *
398 * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd>
399 * <dd>Lookbehind.
400 * <dd>(Note for text capturing......)
401 *
402 * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd>
403 * <dd>Negative lookbehind.
404 * </dl>
405 * </li>
406 *
407 * <li>Misc.
408 * <dl>
409 * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
410 * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
411 * <dd>......
412 * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
413 * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
414 * You can not write comments in character classes and before quantifiers.
415 * </dl>
416 * </li>
417 * </ul>
418 *
419 *
420 * <hr width="50%">
421 * <h3>BNF for the regular expression</h3>
422 * <pre>
423 * regex ::= ('(?' options ')')? term ('|' term)*
424 * term ::= factor+
425 * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
426 * | '(?#' [^)]* ')'
427 * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
428 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
429 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
430 * | '(?>' regex ')' | '(?' options ':' regex ')'
431 * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
432 * options ::= [imsw]* ('-' [imsw]+)?
433 * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
434 * looks ::= '(?=' regex ')' | '(?!' regex ')'
435 * | '(?<=' regex ')' | '(?<!' regex ')'
436 * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
437 * category-block ::= '\' [pP] category-symbol-1
438 * | ('\p{' | '\P{') (category-symbol | block-name
439 * | other-properties) '}'
440 * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
441 * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
442 * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
443 * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
444 * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
445 * | 'Sm' | 'Sc' | 'Sk' | 'So'
446 * block-name ::= (See above)
447 * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
448 * character-1 ::= (any character except meta-characters)
449 *
450 * char-class ::= '[' ranges ']'
451 * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
452 * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
453 * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
454 * | range-char | range-char '-' range-char
455 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
456 * code-point ::= '\x' hex-char hex-char
457 * | '\x{' hex-char+ '}'
458 * <!-- | '\u005c u' hex-char hex-char hex-char hex-char
459 * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
460 * hex-char ::= [0-9a-fA-F]
461 * character-2 ::= (any character except \[]-,)
462 * </pre>
463 *
464 * <hr width="50%">
465 * <h3>TODO</h3>
466 * <ul>
467 * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
468 * <ul>
469 * <li>2.4 Canonical Equivalents
470 * <li>Level 3
471 * </ul>
472 * <li>Parsing performance
473 * </ul>
474 *
475 * <hr width="50%">
476 *
477 * @author TAMURA Kent <kent@trl.ibm.co.jp>
478 * @version $Id: RegularExpression.java 111285 2004-12-08 16:54:26Z cezar $
479 */
480 public class RegularExpression implements java.io.Serializable {
481 static final boolean DEBUG = false;
482
483 /**
484 * Compiles a token tree into an operation flow.
485 */
486 private synchronized void compile(Token tok) {
487 if (this.operations != null)
488 return;
489 this.numberOfClosures = 0;
490 this.operations = this.compile(tok, null, false);
491 }
492
493 /**
494 * Converts a token to an operation.
495 */
496 private Op compile(Token tok, Op next, boolean reverse) {
497 Op ret;
498 switch (tok.type) {
499 case Token.DOT:
500 ret = Op.createDot();
501 ret.next = next;
502 break;
503
504 case Token.CHAR:
505 ret = Op.createChar(tok.getChar());
506 ret.next = next;
507 break;
508
509 case Token.ANCHOR:
510 ret = Op.createAnchor(tok.getChar());
511 ret.next = next;
512 break;
513
514 case Token.RANGE:
515 case Token.NRANGE:
516 ret = Op.createRange(tok);
517 ret.next = next;
518 break;
519
520 case Token.CONCAT:
521 ret = next;
522 if (!reverse) {
523 for (int i = tok.size()-1; i >= 0; i --) {
524 ret = compile(tok.getChild(i), ret, false);
525 }
526 } else {
527 for (int i = 0; i < tok.size(); i ++) {
528 ret = compile(tok.getChild(i), ret, true);
529 }
530 }
531 break;
532
533 case Token.UNION:
534 Op.UnionOp uni = Op.createUnion(tok.size());
535 for (int i = 0; i < tok.size(); i ++) {
536 uni.addElement(compile(tok.getChild(i), next, reverse));
537 }
538 ret = uni; // ret.next is null.
539 break;
540
541 case Token.CLOSURE:
542 case Token.NONGREEDYCLOSURE:
543 Token child = tok.getChild(0);
544 int min = tok.getMin();
545 int max = tok.getMax();
546 if (min >= 0 && min == max) { // {n}
547 ret = next;
548 for (int i = 0; i < min; i ++) {
549 ret = compile(child, ret, reverse);
550 }
551 break;
552 }
553 if (min > 0 && max > 0)
554 max -= min;
555 if (max > 0) {
556 // X{2,6} -> XX(X(X(XX?)?)?)?
557 ret = next;
558 for (int i = 0; i < max; i ++) {
559 Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
560 q.next = next;
561 q.setChild(compile(child, ret, reverse));
562 ret = q;
563 }
564 } else {
565 Op.ChildOp op;
566 if (tok.type == Token.NONGREEDYCLOSURE) {
567 op = Op.createNonGreedyClosure();
568 } else { // Token.CLOSURE
569 if (child.getMinLength() == 0)
570 op = Op.createClosure(this.numberOfClosures++);
571 else
572 op = Op.createClosure(-1);
573 }
574 op.next = next;
575 op.setChild(compile(child, op, reverse));
576 ret = op;
577 }
578 if (min > 0) {
579 for (int i = 0; i < min; i ++) {
580 ret = compile(child, ret, reverse);
581 }
582 }
583 break;
584
585 case Token.EMPTY:
586 ret = next;
587 break;
588
589 case Token.STRING:
590 ret = Op.createString(tok.getString());
591 ret.next = next;
592 break;
593
594 case Token.BACKREFERENCE:
595 ret = Op.createBackReference(tok.getReferenceNumber());
596 ret.next = next;
597 break;
598
599 case Token.PAREN:
600 if (tok.getParenNumber() == 0) {
601 ret = compile(tok.getChild(0), next, reverse);
602 } else if (reverse) {
603 next = Op.createCapture(tok.getParenNumber(), next);
604 next = compile(tok.getChild(0), next, reverse);
605 ret = Op.createCapture(-tok.getParenNumber(), next);
606 } else {
607 next = Op.createCapture(-tok.getParenNumber(), next);
608 next = compile(tok.getChild(0), next, reverse);
609 ret = Op.createCapture(tok.getParenNumber(), next);
610 }
611 break;
612
613 case Token.LOOKAHEAD:
614 ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
615 break;
616 case Token.NEGATIVELOOKAHEAD:
617 ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
618 break;
619 case Token.LOOKBEHIND:
620 ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
621 break;
622 case Token.NEGATIVELOOKBEHIND:
623 ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
624 break;
625
626 case Token.INDEPENDENT:
627 ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
628 break;
629
630 case Token.MODIFIERGROUP:
631 ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
632 ((Token.ModifierToken)tok).getOptions(),
633 ((Token.ModifierToken)tok).getOptionsMask());
634 break;
635
636 case Token.CONDITION:
637 Token.ConditionToken ctok = (Token.ConditionToken)tok;
638 int ref = ctok.refNumber;
639 Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
640 Op yes = compile(ctok.yes, next, reverse);
641 Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
642 ret = Op.createCondition(next, ref, condition, yes, no);
643 break;
644
645 default:
646 throw new RuntimeException("Unknown token type: "+tok.type);
647 } // switch (tok.type)
648 return ret;
649 }
650
651
652 //Public
653
654 /**
655 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
656 *
657 * @return true if the target is matched to this regular expression.
658 */
659 public boolean matches(char[] target) {
660 return this.matches(target, 0, target .length , (Match)null);
661 }
662
663 /**
664 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
665 * in specified range or not.
666 *
667 * @param start Start offset of the range.
668 * @param end End offset +1 of the range.
669 * @return true if the target is matched to this regular expression.
670 */
671 public boolean matches(char[] target, int start, int end) {
672 return this.matches(target, start, end, (Match)null);
673 }
674
675 /**
676 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
677 *
678 * @param match A Match instance for storing matching result.
679 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
680 */
681 public boolean matches(char[] target, Match match) {
682 return this.matches(target, 0, target .length , match);
683 }
684
685
686 /**
687 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
688 * in specified range or not.
689 *
690 * @param start Start offset of the range.
691 * @param end End offset +1 of the range.
692 * @param match A Match instance for storing matching result.
693 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
694 */
695 public boolean matches(char[] target, int start, int end, Match match) {
696
697 synchronized (this) {
698 if (this.operations == null)
699 this.prepare();
700 if (this.context == null)
701 this.context = new Context();
702 }
703 Context con = null;
704 synchronized (this.context) {
705 con = this.context.inuse ? new Context() : this.context;
706 con.reset(target, start, end, this.numberOfClosures);
707 }
708 if (match != null) {
709 match.setNumberOfGroups(this.nofparen);
710 match.setSource(target);
711 } else if (this.hasBackReferences) {
712 match = new Match();
713 match.setNumberOfGroups(this.nofparen);
714 // Need not to call setSource() because
715 // a caller can not access this match instance.
716 }
717 con.match = match;
718
719 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
720 int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
721 //System.err.println("DEBUG: matchEnd="+matchEnd);
722 if (matchEnd == con.limit) {
723 if (con.match != null) {
724 con.match.setBeginning(0, con.start);
725 con.match.setEnd(0, matchEnd);
726 }
727 con.inuse = false;
728 return true;
729 }
730 return false;
731 }
732
733 /*
734 * The pattern has only fixed string.
735 * The engine uses Boyer-Moore.
736 */
737 if (this.fixedStringOnly) {
738 //System.err.println("DEBUG: fixed-only: "+this.fixedString);
739 int o = this.fixedStringTable.matches(target, con.start, con.limit);
740 if (o >= 0) {
741 if (con.match != null) {
742 con.match.setBeginning(0, o);
743 con.match.setEnd(0, o+this.fixedString.length());
744 }
745 con.inuse = false;
746 return true;
747 }
748 con.inuse = false;
749 return false;
750 }
751
752 /*
753 * The pattern contains a fixed string.
754 * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
755 * If not, it return with false.
756 */
757 if (this.fixedString != null) {
758 int o = this.fixedStringTable.matches(target, con.start, con.limit);
759 if (o < 0) {
760 //System.err.println("Non-match in fixed-string search.");
761 con.inuse = false;
762 return false;
763 }
764 }
765
766 int limit = con.limit-this.minlength;
767 int matchStart;
768 int matchEnd = -1;
769
770 /*
771 * Checks whether the expression starts with ".*".
772 */
773 if (this.operations != null
774 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
775 if (isSet(this.options, SINGLE_LINE)) {
776 matchStart = con.start;
777 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
778 } else {
779 boolean previousIsEOL = true;
780 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
781 int ch = target [ matchStart ] ;
782 if (isEOLChar(ch)) {
783 previousIsEOL = true;
784 } else {
785 if (previousIsEOL) {
786 if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
787 matchStart, 1, this.options)))
788 break;
789 }
790 previousIsEOL = false;
791 }
792 }
793 }
794 }
795
796 /*
797 * Optimization against the first character.
798 */
799 else if (this.firstChar != null) {
800 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
801 RangeToken range = this.firstChar;
802 if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
803 range = this.firstChar.getCaseInsensitiveToken();
804 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
805 int ch = target [ matchStart ] ;
806 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
807 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
808 if (!range.match(ch)) continue;
809 } else {
810 if (!range.match(ch)) {
811 char ch1 = Character.toUpperCase((char)ch);
812 if (!range.match(ch1))
813 if (!range.match(Character.toLowerCase(ch1)))
814 continue;
815 }
816 }
817 if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
818 matchStart, 1, this.options)))
819 break;
820 }
821 } else {
822 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
823 int ch = target [ matchStart ] ;
824 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
825 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
826 if (!range.match(ch)) continue;
827 if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
828 matchStart, 1, this.options)))
829 break;
830 }
831 }
832 }
833
834 /*
835 * Straightforward matching.
836 */
837 else {
838 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
839 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
840 break;
841 }
842 }
843
844 if (matchEnd >= 0) {
845 if (con.match != null) {
846 con.match.setBeginning(0, matchStart);
847 con.match.setEnd(0, matchEnd);
848 }
849 con.inuse = false;
850 return true;
851 } else {
852 con.inuse = false;
853 return false;
854 }
855 }
856
857 /**
858 * @return -1 when not match; offset of the end of matched string when match.
859 */
860 private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
861
862 char[] target = con.charTarget;
863
864
865 while (true) {
866 if (op == null)
867 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
868 if (offset > con.limit || offset < con.start)
869 return -1;
870 switch (op.type) {
871 case Op.CHAR:
872 if (isSet(opts, IGNORE_CASE)) {
873 int ch = op.getData();
874 if (dx > 0) {
875 if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] ))
876 return -1;
877 offset ++;
878 } else {
879 int o1 = offset-1;
880 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] ))
881 return -1;
882 offset = o1;
883 }
884 } else {
885 int ch = op.getData();
886 if (dx > 0) {
887 if (offset >= con.limit || ch != target [ offset ] )
888 return -1;
889 offset ++;
890 } else {
891 int o1 = offset-1;
892 if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] )
893 return -1;
894 offset = o1;
895 }
896 }
897 op = op.next;
898 break;
899
900 case Op.DOT:
901 if (dx > 0) {
902 if (offset >= con.limit)
903 return -1;
904 int ch = target [ offset ] ;
905 if (isSet(opts, SINGLE_LINE)) {
906 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
907 offset ++;
908 } else {
909 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
910 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
911 if (isEOLChar(ch))
912 return -1;
913 }
914 offset ++;
915 } else {
916 int o1 = offset-1;
917 if (o1 >= con.limit || o1 < 0)
918 return -1;
919 int ch = target [ o1 ] ;
920 if (isSet(opts, SINGLE_LINE)) {
921 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
922 o1 --;
923 } else {
924 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
925 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
926 if (!isEOLChar(ch))
927 return -1;
928 }
929 offset = o1;
930 }
931 op = op.next;
932 break;
933
934 case Op.RANGE:
935 case Op.NRANGE:
936 if (dx > 0) {
937 if (offset >= con.limit)
938 return -1;
939 int ch = target [ offset ] ;
940 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
941 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
942 RangeToken tok = op.getToken();
943 if (isSet(opts, IGNORE_CASE)) {
944 tok = tok.getCaseInsensitiveToken();
945 if (!tok.match(ch)) {
946 if (ch >= 0x10000) return -1;
947 char uch;
948 if (!tok.match(uch = Character.toUpperCase((char)ch))
949 && !tok.match(Character.toLowerCase(uch)))
950 return -1;
951 }
952 } else {
953 if (!tok.match(ch)) return -1;
954 }
955 offset ++;
956 } else {
957 int o1 = offset-1;
958 if (o1 >= con.limit || o1 < 0)
959 return -1;
960 int ch = target [ o1 ] ;
961 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
962 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
963 RangeToken tok = op.getToken();
964 if (isSet(opts, IGNORE_CASE)) {
965 tok = tok.getCaseInsensitiveToken();
966 if (!tok.match(ch)) {
967 if (ch >= 0x10000) return -1;
968 char uch;
969 if (!tok.match(uch = Character.toUpperCase((char)ch))
970 && !tok.match(Character.toLowerCase(uch)))
971 return -1;
972 }
973 } else {
974 if (!tok.match(ch)) return -1;
975 }
976 offset = o1;
977 }
978 op = op.next;
979 break;
980
981 case Op.ANCHOR:
982 boolean go = false;
983 switch (op.getData()) {
984 case '^':
985 if (isSet(opts, MULTIPLE_LINES)) {
986 if (!(offset == con.start
987 || offset > con.start && isEOLChar( target [ offset-1 ] )))
988 return -1;
989 } else {
990 if (offset != con.start)
991 return -1;
992 }
993 break;
994
995 case '@': // Internal use only.
996 // The @ always matches line beginnings.
997 if (!(offset == con.start
998 || offset > con.start && isEOLChar( target [ offset-1 ] )))
999 return -1;
1000 break;
1001
1002 case '$':
1003 if (isSet(opts, MULTIPLE_LINES)) {
1004 if (!(offset == con.limit
1005 || offset < con.limit && isEOLChar( target [ offset ] )))
1006 return -1;
1007 } else {
1008 if (!(offset == con.limit
1009 || offset+1 == con.limit && isEOLChar( target [ offset ] )
1010 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1011 && target [ offset+1 ] == LINE_FEED))
1012 return -1;
1013 }
1014 break;
1015
1016 case 'A':
1017 if (offset != con.start) return -1;
1018 break;
1019
1020 case 'Z':
1021 if (!(offset == con.limit
1022 || offset+1 == con.limit && isEOLChar( target [ offset ] )
1023 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1024 && target [ offset+1 ] == LINE_FEED))
1025 return -1;
1026 break;
1027
1028 case 'z':
1029 if (offset != con.limit) return -1;
1030 break;
1031
1032 case 'b':
1033 if (con.length == 0) return -1;
1034 {
1035 int after = getWordType(target, con.start, con.limit, offset, opts);
1036 if (after == WT_IGNORE) return -1;
1037 int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1038 if (after == before) return -1;
1039 }
1040 break;
1041
1042 case 'B':
1043 if (con.length == 0)
1044 go = true;
1045 else {
1046 int after = getWordType(target, con.start, con.limit, offset, opts);
1047 go = after == WT_IGNORE
1048 || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1049 }
1050 if (!go) return -1;
1051 break;
1052
1053 case '<':
1054 if (con.length == 0 || offset == con.limit) return -1;
1055 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1056 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1057 return -1;
1058 break;
1059
1060 case '>':
1061 if (con.length == 0 || offset == con.start) return -1;
1062 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1063 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1064 return -1;
1065 break;
1066 } // switch anchor type
1067 op = op.next;
1068 break;
1069
1070 case Op.BACKREFERENCE:
1071 {
1072 int refno = op.getData();
1073 if (refno <= 0 || refno >= this.nofparen)
1074 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1075 if (con.match.getBeginning(refno) < 0
1076 || con.match.getEnd(refno) < 0)
1077 return -1; // ********
1078 int o2 = con.match.getBeginning(refno);
1079 int literallen = con.match.getEnd(refno)-o2;
1080 if (!isSet(opts, IGNORE_CASE)) {
1081 if (dx > 0) {
1082 if (!regionMatches(target, offset, con.limit, o2, literallen))
1083 return -1;
1084 offset += literallen;
1085 } else {
1086 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1087 return -1;
1088 offset -= literallen;
1089 }
1090 } else {
1091 if (dx > 0) {
1092 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1093 return -1;
1094 offset += literallen;
1095 } else {
1096 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1097 o2, literallen))
1098 return -1;
1099 offset -= literallen;
1100 }
1101 }
1102 }
1103 op = op.next;
1104 break;
1105 case Op.STRING:
1106 {
1107 String literal = op.getString();
1108 int literallen = literal.length();
1109 if (!isSet(opts, IGNORE_CASE)) {
1110 if (dx > 0) {
1111 if (!regionMatches(target, offset, con.limit, literal, literallen))
1112 return -1;
1113 offset += literallen;
1114 } else {
1115 if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1116 return -1;
1117 offset -= literallen;
1118 }
1119 } else {
1120 if (dx > 0) {
1121 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1122 return -1;
1123 offset += literallen;
1124 } else {
1125 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1126 literal, literallen))
1127 return -1;
1128 offset -= literallen;
1129 }
1130 }
1131 }
1132 op = op.next;
1133 break;
1134
1135 case Op.CLOSURE:
1136 {
1137 /*
1138 * Saves current position to avoid
1139 * zero-width repeats.
1140 */
1141 int id = op.getData();
1142 if (id >= 0) {
1143 int previousOffset = con.offsets[id];
1144 if (previousOffset < 0 || previousOffset != offset) {
1145 con.offsets[id] = offset;
1146 } else {
1147 con.offsets[id] = -1;
1148 op = op.next;
1149 break;
1150 }
1151 }
1152
1153 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1154 if (id >= 0) con.offsets[id] = -1;
1155 if (ret >= 0) return ret;
1156 op = op.next;
1157 }
1158 break;
1159
1160 case Op.QUESTION:
1161 {
1162 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1163 if (ret >= 0) return ret;
1164 op = op.next;
1165 }
1166 break;
1167
1168 case Op.NONGREEDYCLOSURE:
1169 case Op.NONGREEDYQUESTION:
1170 {
1171 int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1172 if (ret >= 0) return ret;
1173 op = op.getChild();
1174 }
1175 break;
1176
1177 case Op.UNION:
1178 for (int i = 0; i < op.size(); i ++) {
1179 int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
1180 if (DEBUG) {
1181 System.err.println("UNION: "+i+", ret="+ret);
1182 }
1183 if (ret >= 0) return ret;
1184 }
1185 return -1;
1186
1187 case Op.CAPTURE:
1188 int refno = op.getData();
1189 if (con.match != null && refno > 0) {
1190 int save = con.match.getBeginning(refno);
1191 con.match.setBeginning(refno, offset);
1192 int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1193 if (ret < 0) con.match.setBeginning(refno, save);
1194 return ret;
1195 } else if (con.match != null && refno < 0) {
1196 int index = -refno;
1197 int save = con.match.getEnd(index);
1198 con.match.setEnd(index, offset);
1199 int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1200 if (ret < 0) con.match.setEnd(index, save);
1201 return ret;
1202 }
1203 op = op.next;
1204 break;
1205
1206 case Op.LOOKAHEAD:
1207 if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
1208 op = op.next;
1209 break;
1210 case Op.NEGATIVELOOKAHEAD:
1211 if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
1212 op = op.next;
1213 break;
1214 case Op.LOOKBEHIND:
1215 if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
1216 op = op.next;
1217 break;
1218 case Op.NEGATIVELOOKBEHIND:
1219