Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/xmlbeans/impl/regex/RegularExpression.java


1   /*   Copyright 2004 The Apache Software Foundation
2    *
3    *   Licensed under the Apache License, Version 2.0 (the "License");
4    *   you may not use this file except in compliance with the License.
5    *   You may obtain a copy of the License at
6    *
7    *       http://www.apache.org/licenses/LICENSE-2.0
8    *
9    *   Unless required by applicable law or agreed to in writing, software
10   *   distributed under the License is distributed on an "AS IS" BASIS,
11   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   *   See the License for the specific language governing permissions and
13   *  limitations under the License.
14   */
15  
16  package org.apache.xmlbeans.impl.regex;
17  
18  import java.text.CharacterIterator;
19  
20  /**
21   * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
22   * This engine does not conform to the POSIX regular expression.
23   *
24   * <hr width="50%">
25   * <h3>How to use</h3>
26   *
27   * <dl>
28   *   <dt>A. Standard way
29   *   <dd>
30   * <pre>
31   * RegularExpression re = new RegularExpression(<var>regex</var>);
32   * if (re.matches(text)) { ... }
33   * </pre>
34   *
35   *   <dt>B. Capturing groups
36   *   <dd>
37   * <pre>
38   * RegularExpression re = new RegularExpression(<var>regex</var>);
39   * Match match = new Match();
40   * if (re.matches(text, match)) {
41   *     ... // You can refer captured texts with methods of the <code>Match</code> class.
42   * }
43   * </pre>
44   *
45   * </dl>
46   *
47   * <h4>Case-insensitive matching</h4>
48   * <pre>
49   * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
50   * if (re.matches(text) >= 0) { ...}
51   * </pre>
52   *
53   * <h4>Options</h4>
54   * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
55   *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
56   *    This <var>options</var> parameter consists of the following characters.
57   * </p>
58   * <dl>
59   *   <dt><a name="I_OPTION"><code>"i"</code></a>
60   *   <dd>This option indicates case-insensitive matching.
61   *   <dt><a name="M_OPTION"><code>"m"</code></a>
62   *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
63   *   <dt><a name="S_OPTION"><code>"s"</code></a>
64   *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
65   *   <dt><a name="U_OPTION"><code>"u"</code></a>
66   *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
67   *   <dt><a name="W_OPTION"><code>"w"</code></a>
68   *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
69   *      'Unicode Regular Expression Guidelines' Revision 4.
70   *      When "w" and "u" are specified at the same time,
71   *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
72   *   <dt><a name="COMMA_OPTION"><code>","</code></a>
73   *   <dd>The parser treats a comma in a character class as a range separator.
74   *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
75   *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
76   *
77   *   <dt><a name="X_OPTION"><code>"X"</code></a>
78   *   <dd class="REGEX">
79   *       By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
80   *       The <code>match()</code> method does not do subsring matching
81   *       but entire string matching.
82   *
83   * </dl>
84   * 
85   * <hr width="50%">
86   * <h3>Syntax</h3>
87   * <table border="1" bgcolor="#ddeeff">
88   *   <tr>
89   *    <td>
90   *     <h4>Differences from the Perl 5 regular expression</h4>
91   *     <ul>
92   *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
93   *      <li>Supports subtraction, union, and intersection operations for character classes.
94   *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
95   *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
96   *          <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
97   *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
98   *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
99   *     </ul>
100  *    </td>
101  *   </tr>
102  * </table>
103  *
104  * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
105  * <ul>
106  *   <li>Character
107  *     <dl>
108  *       <dt class="REGEX"><kbd>.</kbd> (A period)
109  *       <dd>Matches any one character except the following characters.
110  *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
111  *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
112  *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
113  *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
114  *           it matches any character including the above four characters.
115  *
116  *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
117  *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
118  *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
119  *
120  *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
121  *       <dd>Matches a control character.
122  *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
123  *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
124  *           It matches a control character of which the character code is less than
125  *           the character code of the <var>C</var> by 0x0040.
126  *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
127  *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
128  *
129  *       <dt class="REGEX">a non-meta character
130  *       <dd>Matches the character.
131  *
132  *       <dt class="REGEX"><KBD>\</KBD> + a meta character
133  *       <dd>Matches the meta character.
134  *
135  *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
136  *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
137  *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
138  *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
139  *
140  *       <!--
141  *       <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
142  *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
143  *       -->
144  *
145  *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
146  *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
147  *
148  *       <dt class="REGEX"><kbd>\g</kbd>
149  *       <dd>Matches a grapheme.
150  *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
151  *
152  *       <dt class="REGEX"><kbd>\X</kbd>
153  *       <dd class="REGEX">Matches a combining character sequence.
154  *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
155  *     </dl>
156  *   </li>
157  *
158  *   <li>Character class
159  *     <dl>
160 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
161 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
162  *       <dd>Positive character class.  It matches a character in ranges.
163  *       <dd><var>R<sub>n</sub></var>:
164  *       <ul>
165  *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
166  *             <p>This range matches the character.
167  *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
168  *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
169 + *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
170 + *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
171  *             <p>...
172  *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
173  *             <p>These expressions specifies the same ranges as the following expressions.
174  *       </ul>
175  *       <p class="REGEX">Enumerated ranges are merged (union operation).
176  *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
177  *
178  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
179  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
180  *       <dd>Negative character class.  It matches a character not in ranges.
181  *
182  *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
183  *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
184  *       <dd>Subtraction or union or intersection for character classes.
185  *       <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
186  *       <dd>The result of this operations is a <u>positive character class</u>
187  *           even if an expression includes any negative character classes.
188  *           You have to take care on this in case-insensitive matching.
189  *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
190  *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
191  *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
192  *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
193  *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
194  *
195  *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
196  *       <dd>Character class subtraction for the XML Schema.
197  *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
198  *           
199  *       <dt class="REGEX"><kbd>\d</kbd>
200  *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
201  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
202  *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
203  *
204  *       <dt class="REGEX"><kbd>\D</kbd>
205  *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
206  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
207  *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
208  *
209  *       <dt class="REGEX"><kbd>\s</kbd>
210  *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
211  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
212  *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
213  *
214  *       <dt class="REGEX"><kbd>\S</kbd>
215  *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
216  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
217  *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
218  *
219  *       <dt class="REGEX"><kbd>\w</kbd>
220  *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
221  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
222  *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
223  *
224  *       <dt class="REGEX"><kbd>\W</kbd>
225  *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
226  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
227  *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
228  *
229  *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
230  *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
231  *       The following names are available:
232  *       <dl>
233  *         <dt>Unicode General Categories:
234  *         <dd><kbd>
235  *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
236  *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
237  *         </kbd>
238  *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
239  *         <dt>Unicode Blocks:
240  *         <dd><kbd>
241  *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
242  *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
243  *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
244  *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
245  *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
246  *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
247  *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
248  *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
249  *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
250  *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
251  *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
252  *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
253  *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
254  *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
255  *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
256  *       Small Form Variants, Arabic Presentation Forms-B, Specials,
257  *       Halfwidth and Fullwidth Forms
258  *         </kbd>
259  *         <dt>Others:
260  *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
261  *         <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
262  *         <dd><kbd>UNASSGINED</kbd>
263  *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
264  *       </dl>
265  *
266  *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
267  *       <dd>Matches one character not in the specified General Category or the specified Block.
268  *     </dl>
269  *   </li>
270  *
271  *   <li>Selection and Quantifier
272  *     <dl>
273  *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
274  *       <dd>...
275  *
276  *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
277  *       <dd>Matches 0 or more <var>X</var>.
278  *
279  *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
280  *       <dd>Matches 1 or more <var>X</var>.
281  *
282  *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
283  *       <dd>Matches 0 or 1 <var>X</var>.
284  *
285  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
286  *       <dd>Matches <var>number</var> times.
287  *
288  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
289  *       <dd>...
290  *
291  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
292  *       <dd>...
293  *
294  *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
295  *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
296  *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
297  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
298  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
299  *       <dd>Non-greedy matching.
300  *     </dl>
301  *   </li>
302  *
303  *   <li>Grouping, Capturing, and Back-reference
304  *     <dl>
305  *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
306  *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
307  *       If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
308  *       you have to write "<KBD>(?:foo)+</KBD>".
309  *
310  *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
311  *       <dd>Grouping with capturing.
312  * It make a group and applications can know
313  * where in target text a group matched with methods of a <code>Match</code> instance
314  * after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
315  * The 0th group means whole of this regular expression.
316  * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
317  * 
318  *   <p>For instance, a regular expression is
319  *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
320  *   and target text is
321  *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
322  *   <ul>
323  *     <li><code>Match.getCapturedText(0)</code>:
324  *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
325  *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
326  *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
327  *   </ul>
328  *
329  *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
330  *       <dd>
331  *
332  *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
333  *       <dd>Independent expression group. ................
334  *
335  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
336  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
337  *       <dd>............................
338  *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
339  *           Note that it can not contain 'u'.
340  *
341  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
342  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
343  *       <dd>......
344  *       <dd>These expressions must be at the beginning of a group.
345  *     </dl>
346  *   </li>
347  *
348  *   <li>Anchor
349  *     <dl>
350  *       <dt class="REGEX"><kbd>\A</kbd>
351  *       <dd>Matches the beginnig of the text.
352  *
353  *       <dt class="REGEX"><kbd>\Z</kbd>
354  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
355  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
356  *
357  *       <dt class="REGEX"><kbd>\z</kbd>
358  *       <dd>Matches the end of the text.
359  *
360  *       <dt class="REGEX"><kbd>^</kbd>
361  *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
362  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
363  *           it matches the beginning of the text, or after one of EOL characters (
364  *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
365  *           PARAGRAPH SEPARATOR (U+2029).)
366  *
367  *       <dt class="REGEX"><kbd>$</kbd>
368  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
369  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
370  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
371  *           it matches the end of the text, or before an EOL character.
372  *
373  *       <dt class="REGEX"><kbd>\b</kbd>
374  *       <dd>Matches word boundary.
375  *           (See <a href="#W_OPTION">a "w" option</a>)
376  *
377  *       <dt class="REGEX"><kbd>\B</kbd>
378  *       <dd>Matches non word boundary.
379  *           (See <a href="#W_OPTION">a "w" option</a>)
380  *
381  *       <dt class="REGEX"><kbd>\&lt;</kbd>
382  *       <dd>Matches the beginning of a word.
383  *           (See <a href="#W_OPTION">a "w" option</a>)
384  *
385  *       <dt class="REGEX"><kbd>\&gt;</kbd>
386  *       <dd>Matches the end of a word.
387  *           (See <a href="#W_OPTION">a "w" option</a>)
388  *     </dl>
389  *   </li>
390  *   <li>Lookahead and lookbehind
391  *     <dl>
392  *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
393  *       <dd>Lookahead.
394  *
395  *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
396  *       <dd>Negative lookahead.
397  *
398  *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
399  *       <dd>Lookbehind.
400  *       <dd>(Note for text capturing......)
401  *
402  *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
403  *       <dd>Negative lookbehind.
404  *     </dl>
405  *   </li>
406  *
407  *   <li>Misc.
408  *     <dl>
409  *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
410  *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
411  *       <dd>......
412  *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
413  *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
414  *           You can not write comments in character classes and before quantifiers.
415  *     </dl>
416  *   </li>
417  * </ul>
418  *
419  *
420  * <hr width="50%">
421  * <h3>BNF for the regular expression</h3>
422  * <pre>
423  * regex ::= ('(?' options ')')? term ('|' term)*
424  * term ::= factor+
425  * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
426  *            | '(?#' [^)]* ')'
427  * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
428  * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
429  *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
430  *          | '(?>' regex ')' | '(?' options ':' regex ')'
431  *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
432  * options ::= [imsw]* ('-' [imsw]+)?
433  * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
434  * looks ::= '(?=' regex ')'  | '(?!' regex ')'
435  *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
436  * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
437  * category-block ::= '\' [pP] category-symbol-1
438  *                    | ('\p{' | '\P{') (category-symbol | block-name
439  *                                       | other-properties) '}'
440  * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
441  * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
442  *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
443  *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
444  *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
445  *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
446  * block-name ::= (See above)
447  * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
448  * character-1 ::= (any character except meta-characters)
449  *
450  * char-class ::= '[' ranges ']'
451  *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
452  * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
453  * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
454  *           | range-char | range-char '-' range-char
455  * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
456  * code-point ::= '\x' hex-char hex-char
457  *                | '\x{' hex-char+ '}'
458  * <!--               | '\u005c u' hex-char hex-char hex-char hex-char
459  * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
460  * hex-char ::= [0-9a-fA-F]
461  * character-2 ::= (any character except \[]-,)
462  * </pre>
463  *
464  * <hr width="50%">
465  * <h3>TODO</h3>
466  * <ul>
467  *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
468  *     <ul>
469  *       <li>2.4 Canonical Equivalents
470  *       <li>Level 3
471  *     </ul>
472  *   <li>Parsing performance
473  * </ul>
474  *
475  * <hr width="50%">
476  *
477  * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
478  * @version $Id: RegularExpression.java 111285 2004-12-08 16:54:26Z cezar $
479  */
480 public class RegularExpression implements java.io.Serializable {
481     static final boolean DEBUG = false;
482 
483     /**
484      * Compiles a token tree into an operation flow.
485      */
486     private synchronized void compile(Token tok) {
487         if (this.operations != null)
488             return;
489         this.numberOfClosures = 0;
490         this.operations = this.compile(tok, null, false);
491     }
492 
493     /**
494      * Converts a token to an operation.
495      */
496     private Op compile(Token tok, Op next, boolean reverse) {
497         Op ret;
498         switch (tok.type) {
499         case Token.DOT:
500             ret = Op.createDot();
501             ret.next = next;
502             break;
503 
504         case Token.CHAR:
505             ret = Op.createChar(tok.getChar());
506             ret.next = next;
507             break;
508 
509         case Token.ANCHOR:
510             ret = Op.createAnchor(tok.getChar());
511             ret.next = next;
512             break;
513 
514         case Token.RANGE:
515         case Token.NRANGE:
516             ret = Op.createRange(tok);
517             ret.next = next;
518             break;
519 
520         case Token.CONCAT:
521             ret = next;
522             if (!reverse) {
523                 for (int i = tok.size()-1;  i >= 0;  i --) {
524                     ret = compile(tok.getChild(i), ret, false);
525                 }
526             } else {
527                 for (int i = 0;  i < tok.size();  i ++) {
528                     ret = compile(tok.getChild(i), ret, true);
529                 }
530             }
531             break;
532 
533         case Token.UNION:
534             Op.UnionOp uni = Op.createUnion(tok.size());
535             for (int i = 0;  i < tok.size();  i ++) {
536                 uni.addElement(compile(tok.getChild(i), next, reverse));
537             }
538             ret = uni;                          // ret.next is null.
539             break;
540 
541         case Token.CLOSURE:
542         case Token.NONGREEDYCLOSURE:
543             Token child = tok.getChild(0);
544             int min = tok.getMin();
545             int max = tok.getMax();
546             if (min >= 0 && min == max) { // {n}
547                 ret = next;
548                 for (int i = 0; i < min;  i ++) {
549                     ret = compile(child, ret, reverse);
550                 }
551                 break;
552             }
553             if (min > 0 && max > 0)
554                 max -= min;
555             if (max > 0) {
556                 // X{2,6} -> XX(X(X(XX?)?)?)?
557                 ret = next;
558                 for (int i = 0;  i < max;  i ++) {
559                     Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
560                     q.next = next;
561                     q.setChild(compile(child, ret, reverse));
562                     ret = q;
563                 }
564             } else {
565                 Op.ChildOp op;
566                 if (tok.type == Token.NONGREEDYCLOSURE) {
567                     op = Op.createNonGreedyClosure();
568                 } else {                        // Token.CLOSURE
569                     if (child.getMinLength() == 0)
570                         op = Op.createClosure(this.numberOfClosures++);
571                     else
572                         op = Op.createClosure(-1);
573                 }
574                 op.next = next;
575                 op.setChild(compile(child, op, reverse));
576                 ret = op;
577             }
578             if (min > 0) {
579                 for (int i = 0;  i < min;  i ++) {
580                     ret = compile(child, ret, reverse);
581                 }
582             }
583             break;
584 
585         case Token.EMPTY:
586             ret = next;
587             break;
588 
589         case Token.STRING:
590             ret = Op.createString(tok.getString());
591             ret.next = next;
592             break;
593 
594         case Token.BACKREFERENCE:
595             ret = Op.createBackReference(tok.getReferenceNumber());
596             ret.next = next;
597             break;
598 
599         case Token.PAREN:
600             if (tok.getParenNumber() == 0) {
601                 ret = compile(tok.getChild(0), next, reverse);
602             } else if (reverse) {
603                 next = Op.createCapture(tok.getParenNumber(), next);
604                 next = compile(tok.getChild(0), next, reverse);
605                 ret = Op.createCapture(-tok.getParenNumber(), next);
606             } else {
607                 next = Op.createCapture(-tok.getParenNumber(), next);
608                 next = compile(tok.getChild(0), next, reverse);
609                 ret = Op.createCapture(tok.getParenNumber(), next);
610             }
611             break;
612 
613         case Token.LOOKAHEAD:
614             ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
615             break;
616         case Token.NEGATIVELOOKAHEAD:
617             ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
618             break;
619         case Token.LOOKBEHIND:
620             ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
621             break;
622         case Token.NEGATIVELOOKBEHIND:
623             ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
624             break;
625 
626         case Token.INDEPENDENT:
627             ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
628             break;
629 
630         case Token.MODIFIERGROUP:
631             ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
632                                     ((Token.ModifierToken)tok).getOptions(),
633                                     ((Token.ModifierToken)tok).getOptionsMask());
634             break;
635 
636         case Token.CONDITION:
637             Token.ConditionToken ctok = (Token.ConditionToken)tok;
638             int ref = ctok.refNumber;
639             Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
640             Op yes = compile(ctok.yes, next, reverse);
641             Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
642             ret = Op.createCondition(next, ref, condition, yes, no);
643             break;
644 
645         default:
646             throw new RuntimeException("Unknown token type: "+tok.type);
647         } // switch (tok.type)
648         return ret;
649     }
650 
651 
652 //Public
653 
654     /**
655      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
656      *
657      * @return true if the target is matched to this regular expression.
658      */
659     public boolean matches(char[]  target) {
660         return this.matches(target, 0,  target .length , (Match)null);
661     }
662 
663     /**
664      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
665      * in specified range or not.
666      *
667      * @param start Start offset of the range.
668      * @param end  End offset +1 of the range.
669      * @return true if the target is matched to this regular expression.
670      */
671     public boolean matches(char[]  target, int start, int end) {
672         return this.matches(target, start, end, (Match)null);
673     }
674 
675     /**
676      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
677      *
678      * @param match A Match instance for storing matching result.
679      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
680      */
681     public boolean matches(char[]  target, Match match) {
682         return this.matches(target, 0,  target .length , match);
683     }
684 
685 
686     /**
687      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
688      * in specified range or not.
689      *
690      * @param start Start offset of the range.
691      * @param end  End offset +1 of the range.
692      * @param match A Match instance for storing matching result.
693      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
694      */
695     public boolean matches(char[]  target, int start, int end, Match match) {
696 
697         synchronized (this) {
698             if (this.operations == null)
699                 this.prepare();
700             if (this.context == null)
701                 this.context = new Context();
702         }
703         Context con = null;
704         synchronized (this.context) {
705             con = this.context.inuse ? new Context() : this.context;
706             con.reset(target, start, end, this.numberOfClosures);
707         }
708         if (match != null) {
709             match.setNumberOfGroups(this.nofparen);
710             match.setSource(target);
711         } else if (this.hasBackReferences) {
712             match = new Match();
713             match.setNumberOfGroups(this.nofparen);
714             // Need not to call setSource() because
715             // a caller can not access this match instance.
716         }
717         con.match = match;
718 
719         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
720             int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
721             //System.err.println("DEBUG: matchEnd="+matchEnd);
722             if (matchEnd == con.limit) {
723                 if (con.match != null) {
724                     con.match.setBeginning(0, con.start);
725                     con.match.setEnd(0, matchEnd);
726                 }
727                 con.inuse = false;
728                 return true;
729             }
730             return false;
731         }
732 
733         /*
734          * The pattern has only fixed string.
735          * The engine uses Boyer-Moore.
736          */
737         if (this.fixedStringOnly) {
738             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
739             int o = this.fixedStringTable.matches(target, con.start, con.limit);
740             if (o >= 0) {
741                 if (con.match != null) {
742                     con.match.setBeginning(0, o);
743                     con.match.setEnd(0, o+this.fixedString.length());
744                 }
745                 con.inuse = false;
746                 return true;
747             }
748             con.inuse = false;
749             return false;
750         }
751 
752         /*
753          * The pattern contains a fixed string.
754          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
755          * If not, it return with false.
756          */
757         if (this.fixedString != null) {
758             int o = this.fixedStringTable.matches(target, con.start, con.limit);
759             if (o < 0) {
760                 //System.err.println("Non-match in fixed-string search.");
761                 con.inuse = false;
762                 return false;
763             }
764         }
765 
766         int limit = con.limit-this.minlength;
767         int matchStart;
768         int matchEnd = -1;
769 
770         /*
771          * Checks whether the expression starts with ".*".
772          */
773         if (this.operations != null
774             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
775             if (isSet(this.options, SINGLE_LINE)) {
776                 matchStart = con.start;
777                 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
778             } else {
779                 boolean previousIsEOL = true;
780                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
781                     int ch =  target [  matchStart ] ;
782                     if (isEOLChar(ch)) {
783                         previousIsEOL = true;
784                     } else {
785                         if (previousIsEOL) {
786                             if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
787                                                                        matchStart, 1, this.options)))
788                                 break;
789                         }
790                         previousIsEOL = false;
791                     }
792                 }
793             }
794         }
795 
796         /*
797          * Optimization against the first character.
798          */
799         else if (this.firstChar != null) {
800             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
801             RangeToken range = this.firstChar;
802             if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
803                 range = this.firstChar.getCaseInsensitiveToken();
804                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
805                     int ch =  target [  matchStart ] ;
806                     if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
807                         ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
808                         if (!range.match(ch))  continue;
809                     } else {
810                         if (!range.match(ch)) {
811                             char ch1 = Character.toUpperCase((char)ch);
812                             if (!range.match(ch1))
813                                 if (!range.match(Character.toLowerCase(ch1)))
814                                     continue;
815                         }
816                     }
817                     if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
818                                                                matchStart, 1, this.options)))
819                         break;
820                 }
821             } else {
822                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
823                     int ch =  target [  matchStart ] ;
824                     if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
825                         ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
826                     if (!range.match(ch))  continue;
827                     if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
828                                                                matchStart, 1, this.options)))
829                         break;
830                 }
831             }
832         }
833 
834         /*
835          * Straightforward matching.
836          */
837         else {
838             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
839                 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
840                     break;
841             }
842         }
843 
844         if (matchEnd >= 0) {
845             if (con.match != null) {
846                 con.match.setBeginning(0, matchStart);
847                 con.match.setEnd(0, matchEnd);
848             }
849             con.inuse = false;
850             return true;
851         } else {
852             con.inuse = false;
853             return false;
854         }
855     }
856 
857 /**
858  * @return -1 when not match; offset of the end of matched string when match.
859  */
860     private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
861 
862         char[] target = con.charTarget;
863 
864 
865         while (true) {
866             if (op == null)
867                 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
868             if (offset > con.limit || offset < con.start)
869                 return -1;
870             switch (op.type) {
871             case Op.CHAR:
872                 if (isSet(opts, IGNORE_CASE)) {
873                     int ch = op.getData();
874                     if (dx > 0) {
875                         if (offset >= con.limit || !matchIgnoreCase(ch,  target [  offset ] ))
876                             return -1;
877                         offset ++;
878                     } else {
879                         int o1 = offset-1;
880                         if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target [  o1 ] ))
881                             return -1;
882                         offset = o1;
883                     }
884                 } else {
885                     int ch = op.getData();
886                     if (dx > 0) {
887                         if (offset >= con.limit || ch !=  target [  offset ] )
888                             return -1;
889                         offset ++;
890                     } else {
891                         int o1 = offset-1;
892                         if (o1 >= con.limit || o1 < 0 || ch !=  target [  o1 ] )
893                             return -1;
894                         offset = o1;
895                     }
896                 }
897                 op = op.next;
898                 break;
899 
900             case Op.DOT:
901                 if (dx > 0) {
902                     if (offset >= con.limit)
903                         return -1;
904                     int ch =  target [  offset ] ;
905                     if (isSet(opts, SINGLE_LINE)) {
906                         if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
907                             offset ++;
908                     } else {
909                         if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
910                             ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
911                         if (isEOLChar(ch))
912                             return -1;
913                     }
914                     offset ++;
915                 } else {
916                     int o1 = offset-1;
917                     if (o1 >= con.limit || o1 < 0)
918                         return -1;
919                     int ch =  target [  o1 ] ;
920                     if (isSet(opts, SINGLE_LINE)) {
921                         if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
922                             o1 --;
923                     } else {
924                         if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
925                             ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
926                         if (!isEOLChar(ch))
927                             return -1;
928                     }
929                     offset = o1;
930                 }
931                 op = op.next;
932                 break;
933 
934             case Op.RANGE:
935             case Op.NRANGE:
936                 if (dx > 0) {
937                     if (offset >= con.limit)
938                         return -1;
939                     int ch =  target [  offset ] ;
940                     if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
941                         ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
942                     RangeToken tok = op.getToken();
943                     if (isSet(opts, IGNORE_CASE)) {
944                         tok = tok.getCaseInsensitiveToken();
945                         if (!tok.match(ch)) {
946                             if (ch >= 0x10000)  return -1;
947                             char uch;
948                             if (!tok.match(uch = Character.toUpperCase((char)ch))
949                                 && !tok.match(Character.toLowerCase(uch)))
950                                 return -1;
951                         }
952                     } else {
953                         if (!tok.match(ch))  return -1;
954                     }
955                     offset ++;
956                 } else {
957                     int o1 = offset-1;
958                     if (o1 >= con.limit || o1 < 0)
959                         return -1;
960                     int ch =  target [  o1 ] ;
961                     if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
962                         ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
963                     RangeToken tok = op.getToken();
964                     if (isSet(opts, IGNORE_CASE)) {
965                         tok = tok.getCaseInsensitiveToken();
966                         if (!tok.match(ch)) {
967                             if (ch >= 0x10000)  return -1;
968                             char uch;
969                             if (!tok.match(uch = Character.toUpperCase((char)ch))
970                                 && !tok.match(Character.toLowerCase(uch)))
971                                 return -1;
972                         }
973                     } else {
974                         if (!tok.match(ch))  return -1;
975                     }
976                     offset = o1;
977                 }
978                 op = op.next;
979                 break;
980 
981             case Op.ANCHOR:
982                 boolean go = false;
983                 switch (op.getData()) {
984                 case '^':
985                     if (isSet(opts, MULTIPLE_LINES)) {
986                         if (!(offset == con.start
987                               || offset > con.start && isEOLChar( target [  offset-1 ] )))
988                             return -1;
989                     } else {
990                         if (offset != con.start)
991                             return -1;
992                     }
993                     break;
994 
995                 case '@':                         // Internal use only.
996                     // The @ always matches line beginnings.
997                     if (!(offset == con.start
998                           || offset > con.start && isEOLChar( target [  offset-1 ] )))
999                         return -1;
1000                    break;
1001
1002                case '$':
1003                    if (isSet(opts, MULTIPLE_LINES)) {
1004                        if (!(offset == con.limit
1005                              || offset < con.limit && isEOLChar( target [  offset ] )))
1006                            return -1;
1007                    } else {
1008                        if (!(offset == con.limit
1009                              || offset+1 == con.limit && isEOLChar( target [  offset ] )
1010                              || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
1011                              &&  target [  offset+1 ]  == LINE_FEED))
1012                            return -1;
1013                    }
1014                    break;
1015
1016                case 'A':
1017                    if (offset != con.start)  return -1;
1018                    break;
1019
1020                case 'Z':
1021                    if (!(offset == con.limit
1022                          || offset+1 == con.limit && isEOLChar( target [  offset ] )
1023                          || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
1024                          &&  target [  offset+1 ]  == LINE_FEED))
1025                        return -1;
1026                    break;
1027
1028                case 'z':
1029                    if (offset != con.limit)  return -1;
1030                    break;
1031
1032                case 'b':
1033                    if (con.length == 0)  return -1;
1034                    {
1035                        int after = getWordType(target, con.start, con.limit, offset, opts);
1036                        if (after == WT_IGNORE)  return -1;
1037                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1038                        if (after == before)  return -1;
1039                    }
1040                    break;
1041
1042                case 'B':
1043                    if (con.length == 0)
1044                        go = true;
1045                    else {
1046                        int after = getWordType(target, con.start, con.limit, offset, opts);
1047                        go = after == WT_IGNORE
1048                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1049                    }
1050                    if (!go)  return -1;
1051                    break;
1052
1053                case '<':
1054                    if (con.length == 0 || offset == con.limit)  return -1;
1055                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1056                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1057                        return -1;
1058                    break;
1059
1060                case '>':
1061                    if (con.length == 0 || offset == con.start)  return -1;
1062                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1063                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1064                        return -1;
1065                    break;
1066                } // switch anchor type
1067                op = op.next;
1068                break;
1069
1070            case Op.BACKREFERENCE:
1071                {
1072                    int refno = op.getData();
1073                    if (refno <= 0 || refno >= this.nofparen)
1074                        throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1075                    if (con.match.getBeginning(refno) < 0
1076                        || con.match.getEnd(refno) < 0)
1077                        return -1;                // ********
1078                    int o2 = con.match.getBeginning(refno);
1079                    int literallen = con.match.getEnd(refno)-o2;
1080                    if (!isSet(opts, IGNORE_CASE)) {
1081                        if (dx > 0) {
1082                            if (!regionMatches(target, offset, con.limit, o2, literallen))
1083                                return -1;
1084                            offset += literallen;
1085                        } else {
1086                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1087                                return -1;
1088                            offset -= literallen;
1089                        }
1090                    } else {
1091                        if (dx > 0) {
1092                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1093                                return -1;
1094                            offset += literallen;
1095                        } else {
1096                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1097                                                         o2, literallen))
1098                                return -1;
1099                            offset -= literallen;
1100                        }
1101                    }
1102                }
1103                op = op.next;
1104                break;
1105            case Op.STRING:
1106                {
1107                    String literal = op.getString();
1108                    int literallen = literal.length();
1109                    if (!isSet(opts, IGNORE_CASE)) {
1110                        if (dx > 0) {
1111                            if (!regionMatches(target, offset, con.limit, literal, literallen))
1112                                return -1;
1113                            offset += literallen;
1114                        } else {
1115                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1116                                return -1;
1117                            offset -= literallen;
1118                        }
1119                    } else {
1120                        if (dx > 0) {
1121                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1122                                return -1;
1123                            offset += literallen;
1124                        } else {
1125                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1126                                                         literal, literallen))
1127                                return -1;
1128                            offset -= literallen;
1129                        }
1130                    }
1131                }
1132                op = op.next;
1133                break;
1134
1135            case Op.CLOSURE:
1136                {
1137                    /*
1138                     * Saves current position to avoid
1139                     * zero-width repeats.
1140                     */
1141                    int id = op.getData();
1142                    if (id >= 0) {
1143                        int previousOffset = con.offsets[id];
1144                        if (previousOffset < 0 || previousOffset != offset) {
1145                            con.offsets[id] = offset;
1146                        } else {
1147                            con.offsets[id] = -1;
1148                            op = op.next;
1149                            break;
1150                        }
1151                    }
1152
1153                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1154                    if (id >= 0)  con.offsets[id] = -1;
1155                    if (ret >= 0)  return ret;
1156                    op = op.next;
1157                }
1158                break;
1159
1160            case Op.QUESTION:
1161                {
1162                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1163                    if (ret >= 0)  return ret;
1164                    op = op.next;
1165                }
1166                break;
1167
1168            case Op.NONGREEDYCLOSURE:
1169            case Op.NONGREEDYQUESTION:
1170                {
1171                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1172                    if (ret >= 0)  return ret;
1173                    op = op.getChild();
1174                }
1175                break;
1176
1177            case Op.UNION:
1178                for (int i = 0;  i < op.size();  i ++) {
1179                    int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
1180                    if (DEBUG) {
1181                        System.err.println("UNION: "+i+", ret="+ret);
1182                    }
1183                    if (ret >= 0)  return ret;
1184                }
1185                return -1;
1186
1187            case Op.CAPTURE:
1188                int refno = op.getData();
1189                if (con.match != null && refno > 0) {
1190                    int save = con.match.getBeginning(refno);
1191                    con.match.setBeginning(refno, offset);
1192                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1193                    if (ret < 0)  con.match.setBeginning(refno, save);
1194                    return ret;
1195                } else if (con.match != null && refno < 0) {
1196                    int index = -refno;
1197                    int save = con.match.getEnd(index);
1198                    con.match.setEnd(index, offset);
1199                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1200                    if (ret < 0)  con.match.setEnd(index, save);
1201                    return ret;
1202                }
1203                op = op.next;
1204                break;
1205
1206            case Op.LOOKAHEAD:
1207                if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts))  return -1;
1208                op = op.next;
1209                break;
1210            case Op.NEGATIVELOOKAHEAD:
1211                if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts))  return -1;
1212                op = op.next;
1213                break;
1214            case Op.LOOKBEHIND:
1215                if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts))  return -1;
1216                op = op.next;
1217                break;
1218            case Op.NEGATIVELOOKBEHIND:
1219