Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/xerces/utils/regex/RegularExpression.java


1   /*
2    * The Apache Software License, Version 1.1
3    *
4    *
5    * Copyright (c) 1999,2000 The Apache Software Foundation.  All rights 
6    * reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution,
21   *    if any, must include the following acknowledgment:  
22   *       "This product includes software developed by the
23   *        Apache Software Foundation (http://www.apache.org/)."
24   *    Alternately, this acknowledgment may appear in the software itself,
25   *    if and wherever such third-party acknowledgments normally appear.
26   *
27   * 4. The names "Xerces" and "Apache Software Foundation" must
28   *    not be used to endorse or promote products derived from this
29   *    software without prior written permission. For written 
30   *    permission, please contact apache@apache.org.
31   *
32   * 5. Products derived from this software may not be called "Apache",
33   *    nor may "Apache" appear in their name, without prior written
34   *    permission of the Apache Software Foundation.
35   *
36   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47   * SUCH DAMAGE.
48   * ====================================================================
49   *
50   * This software consists of voluntary contributions made by many
51   * individuals on behalf of the Apache Software Foundation and was
52   * originally based on software copyright (c) 1999, International
53   * Business Machines, Inc., http://www.apache.org.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  package org.apache.xerces.utils.regex;
59  
60  
61  import java.text.CharacterIterator;
62  
63  /**
64   * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
65   * This engine does not conform to the POSIX regular expression.
66   *
67   * <hr width="50%">
68   * <h3>How to use</h3>
69   *
70   * <dl>
71   *   <dt>A. Standard way
72   *   <dd>
73   * <pre>
74   * RegularExpression re = new RegularExpression(<var>regex</var>);
75   * if (re.matches(text)) { ... }
76   * </pre>
77   *
78   *   <dt>B. Capturing groups
79   *   <dd>
80   * <pre>
81   * RegularExpression re = new RegularExpression(<var>regex</var>);
82   * Match match = new Match();
83   * if (re.matches(text, match)) {
84   *     ... // You can refer captured texts with methods of the <code>Match</code> class.
85   * }
86   * </pre>
87   *
88   * </dl>
89   *
90   * <h4>Case-insensitive matching</h4>
91   * <pre>
92   * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
93   * if (re.matches(text) >= 0) { ...}
94   * </pre>
95   *
96   * <h4>Options</h4>
97   * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
98   *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
99   *    This <var>options</var> parameter consists of the following characters.
100  * </p>
101  * <dl>
102  *   <dt><a name="I_OPTION"><code>"i"</code></a>
103  *   <dd>This option indicates case-insensitive matching.
104  *   <dt><a name="M_OPTION"><code>"m"</code></a>
105  *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
106  *   <dt><a name="S_OPTION"><code>"s"</code></a>
107  *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
108  *   <dt><a name="U_OPTION"><code>"u"</code></a>
109  *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
110  *   <dt><a name="W_OPTION"><code>"w"</code></a>
111  *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
112  *      'Unicode Regular Expression Guidelines' Revision 4.
113  *      When "w" and "u" are specified at the same time,
114  *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
115  *   <dt><a name="COMMA_OPTION"><code>","</code></a>
116  *   <dd>The parser treats a comma in a character class as a range separator.
117  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
118  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
119  *
120  *   <dt><a name="X_OPTION"><code>"X"</code></a>
121  *   <dd class="REGEX">
122  *       By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
123  *       The <code>match()</code> method does not do subsring matching
124  *       but entire string matching.
125  *
126  * </dl>
127  * 
128  * <hr width="50%">
129  * <h3>Syntax</h3>
130  * <table border="1" bgcolor="#ddeeff">
131  *   <tr>
132  *    <td>
133  *     <h4>Differences from the Perl 5 regular expression</h4>
134  *     <ul>
135  *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
136  *      <li>Supports subtraction, union, and intersection operations for character classes.
137  *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
138  *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
139  *          <kbd>\u005cu</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
140  *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
141  *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
142  *     </ul>
143  *    </td>
144  *   </tr>
145  * </table>
146  *
147  * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
148  * <ul>
149  *   <li>Character
150  *     <dl>
151  *       <dt class="REGEX"><kbd>.</kbd> (A period)
152  *       <dd>Matches any one character except the following characters.
153  *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
154  *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
155  *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
156  *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
157  *           it matches any character including the above four characters.
158  *
159  *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
160  *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
161  *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
162  *
163  *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
164  *       <dd>Matches a control character.
165  *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
166  *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
167  *           It matches a control character of which the character code is less than
168  *           the character code of the <var>C</var> by 0x0040.
169  *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
170  *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
171  *
172  *       <dt class="REGEX">a non-meta character
173  *       <dd>Matches the character.
174  *
175  *       <dt class="REGEX"><KBD>\</KBD> + a meta character
176  *       <dd>Matches the meta character.
177  *
178  *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
179  *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
180  *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
181  *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
182  *
183  *       <!--
184  *       <dt class="REGEX"><kbd>\u005cu</kbd><var>HHHH</var>
185  *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
186  *       -->
187  *
188  *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
189  *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
190  *
191  *       <dt class="REGEX"><kbd>\g</kbd>
192  *       <dd>Matches a grapheme.
193  *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
194  *
195  *       <dt class="REGEX"><kbd>\X</kbd>
196  *       <dd class="REGEX">Matches a combining character sequence.
197  *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
198  *     </dl>
199  *   </li>
200  *
201  *   <li>Character class
202  *     <dl>
203 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
204 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
205  *       <dd>Positive character class.  It matches a character in ranges.
206  *       <dd><var>R<sub>n</sub></var>:
207  *       <ul>
208  *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005cu</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
209  *             <p>This range matches the character.
210  *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
211  *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
212 + *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
213 + *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
214  *             <p>...
215  *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
216  *             <p>These expressions specifies the same ranges as the following expressions.
217  *       </ul>
218  *       <p class="REGEX">Enumerated ranges are merged (union operation).
219  *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
220  *
221  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
222  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
223  *       <dd>Negative character class.  It matches a character not in ranges.
224  *
225  *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
226  *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
227  *       <dd>Subtraction or union or intersection for character classes.
228  *       <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
229  *       <dd>The result of this operations is a <u>positive character class</u>
230  *           even if an expression includes any negative character classes.
231  *           You have to take care on this in case-insensitive matching.
232  *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
233  *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
234  *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
235  *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
236  *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
237  *
238  *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
239  *       <dd>Character class subtraction for the XML Schema.
240  *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
241  *           
242  *       <dt class="REGEX"><kbd>\d</kbd>
243  *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
244  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
245  *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
246  *
247  *       <dt class="REGEX"><kbd>\D</kbd>
248  *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
249  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
250  *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
251  *
252  *       <dt class="REGEX"><kbd>\s</kbd>
253  *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
254  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
255  *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
256  *
257  *       <dt class="REGEX"><kbd>\S</kbd>
258  *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
259  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
260  *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
261  *
262  *       <dt class="REGEX"><kbd>\w</kbd>
263  *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
264  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
265  *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
266  *
267  *       <dt class="REGEX"><kbd>\W</kbd>
268  *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
269  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
270  *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
271  *
272  *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
273  *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
274  *       The following names are available:
275  *       <dl>
276  *         <dt>Unicode General Categories:
277  *         <dd><kbd>
278  *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
279  *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
280  *         </kbd>
281  *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
282  *         <dt>Unicode Blocks:
283  *         <dd><kbd>
284  *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
285  *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
286  *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
287  *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
288  *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
289  *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
290  *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
291  *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
292  *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
293  *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
294  *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
295  *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
296  *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
297  *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
298  *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
299  *       Small Form Variants, Arabic Presentation Forms-B, Specials,
300  *       Halfwidth and Fullwidth Forms
301  *         </kbd>
302  *         <dt>Others:
303  *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
304  *         <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
305  *         <dd><kbd>UNASSGINED</kbd>
306  *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
307  *       </dl>
308  *
309  *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
310  *       <dd>Matches one character not in the specified General Category or the specified Block.
311  *     </dl>
312  *   </li>
313  *
314  *   <li>Selection and Quantifier
315  *     <dl>
316  *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
317  *       <dd>...
318  *
319  *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
320  *       <dd>Matches 0 or more <var>X</var>.
321  *
322  *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
323  *       <dd>Matches 1 or more <var>X</var>.
324  *
325  *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
326  *       <dd>Matches 0 or 1 <var>X</var>.
327  *
328  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
329  *       <dd>Matches <var>number</var> times.
330  *
331  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
332  *       <dd>...
333  *
334  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
335  *       <dd>...
336  *
337  *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
338  *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
339  *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
340  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
341  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
342  *       <dd>Non-greedy matching.
343  *     </dl>
344  *   </li>
345  *
346  *   <li>Grouping, Capturing, and Back-reference
347  *     <dl>
348  *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
349  *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
350  *       If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
351  *       you have to write "<KBD>(?:foo)+</KBD>".
352  *
353  *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
354  *       <dd>Grouping with capturing.
355  * It make a group and applications can know
356  * where in target text a group matched with methods of a <code>Match</code> instance
357  * after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
358  * The 0th group means whole of this regular expression.
359  * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
360  * 
361  *   <p>For instance, a regular expression is
362  *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
363  *   and target text is
364  *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
365  *   <ul>
366  *     <li><code>Match.getCapturedText(0)</code>:
367  *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
368  *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
369  *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
370  *   </ul>
371  *
372  *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
373  *       <dd>
374  *
375  *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
376  *       <dd>Independent expression group. ................
377  *
378  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
379  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
380  *       <dd>............................
381  *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
382  *           Note that it can not contain 'u'.
383  *
384  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
385  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
386  *       <dd>......
387  *       <dd>These expressions must be at the beginning of a group.
388  *     </dl>
389  *   </li>
390  *
391  *   <li>Anchor
392  *     <dl>
393  *       <dt class="REGEX"><kbd>\A</kbd>
394  *       <dd>Matches the beginnig of the text.
395  *
396  *       <dt class="REGEX"><kbd>\Z</kbd>
397  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
398  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
399  *
400  *       <dt class="REGEX"><kbd>\z</kbd>
401  *       <dd>Matches the end of the text.
402  *
403  *       <dt class="REGEX"><kbd>^</kbd>
404  *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
405  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
406  *           it matches the beginning of the text, or after one of EOL characters (
407  *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
408  *           PARAGRAPH SEPARATOR (U+2029).)
409  *
410  *       <dt class="REGEX"><kbd>$</kbd>
411  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
412  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
413  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
414  *           it matches the end of the text, or before an EOL character.
415  *
416  *       <dt class="REGEX"><kbd>\b</kbd>
417  *       <dd>Matches word boundary.
418  *           (See <a href="#W_OPTION">a "w" option</a>)
419  *
420  *       <dt class="REGEX"><kbd>\B</kbd>
421  *       <dd>Matches non word boundary.
422  *           (See <a href="#W_OPTION">a "w" option</a>)
423  *
424  *       <dt class="REGEX"><kbd>\&lt;</kbd>
425  *       <dd>Matches the beginning of a word.
426  *           (See <a href="#W_OPTION">a "w" option</a>)
427  *
428  *       <dt class="REGEX"><kbd>\&gt;</kbd>
429  *       <dd>Matches the end of a word.
430  *           (See <a href="#W_OPTION">a "w" option</a>)
431  *     </dl>
432  *   </li>
433  *   <li>Lookahead and lookbehind
434  *     <dl>
435  *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
436  *       <dd>Lookahead.
437  *
438  *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
439  *       <dd>Negative lookahead.
440  *
441  *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
442  *       <dd>Lookbehind.
443  *       <dd>(Note for text capturing......)
444  *
445  *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
446  *       <dd>Negative lookbehind.
447  *     </dl>
448  *   </li>
449  *
450  *   <li>Misc.
451  *     <dl>
452  *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
453  *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
454  *       <dd>......
455  *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
456  *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
457  *           You can not write comments in character classes and before quantifiers.
458  *     </dl>
459  *   </li>
460  * </ul>
461  *
462  *
463  * <hr width="50%">
464  * <h3>BNF for the regular expression</h3>
465  * <pre>
466  * regex ::= ('(?' options ')')? term ('|' term)*
467  * term ::= factor+
468  * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
469  *            | '(?#' [^)]* ')'
470  * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
471  * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
472  *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
473  *          | '(?>' regex ')' | '(?' options ':' regex ')'
474  *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
475  * options ::= [imsw]* ('-' [imsw]+)?
476  * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
477  * looks ::= '(?=' regex ')'  | '(?!' regex ')'
478  *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
479  * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
480  * category-block ::= '\' [pP] category-symbol-1
481  *                    | ('\p{' | '\P{') (category-symbol | block-name
482  *                                       | other-properties) '}'
483  * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
484  * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
485  *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
486  *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
487  *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
488  *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
489  * block-name ::= (See above)
490  * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
491  * character-1 ::= (any character except meta-characters)
492  *
493  * char-class ::= '[' ranges ']'
494  *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
495  * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
496  * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
497  *           | range-char | range-char '-' range-char
498  * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
499  * code-point ::= '\x' hex-char hex-char
500  *                | '\x{' hex-char+ '}'
501  * <!--               | '\u005cu' hex-char hex-char hex-char hex-char
502  * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
503  * hex-char ::= [0-9a-fA-F]
504  * character-2 ::= (any character except \[]-,)
505  * </pre>
506  *
507  * <hr width="50%">
508  * <h3>TODO</h3>
509  * <ul>
510  *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
511  *     <ul>
512  *       <li>2.4 Canonical Equivalents
513  *       <li>Level 3
514  *     </ul>
515  *   <li>Parsing performance
516  * </ul>
517  *
518  * <hr width="50%">
519  *
520  * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
521  */
522 public class RegularExpression implements java.io.Serializable {
523     static final boolean DEBUG = false;
524 
525     /**
526      * Compiles a token tree into an operation flow.
527      */
528     private synchronized void compile(Token tok) {
529         if (this.operations != null)
530             return;
531         this.numberOfClosures = 0;
532         this.operations = this.compile(tok, null, false);
533     }
534 
535     /**
536      * Converts a token to an operation.
537      */
538     private Op compile(Token tok, Op next, boolean reverse) {
539         Op ret;
540         switch (tok.type) {
541         case Token.DOT:
542             ret = Op.createDot();
543             ret.next = next;
544             break;
545 
546         case Token.CHAR:
547             ret = Op.createChar(tok.getChar());
548             ret.next = next;
549             break;
550 
551         case Token.ANCHOR:
552             ret = Op.createAnchor(tok.getChar());
553             ret.next = next;
554             break;
555 
556         case Token.RANGE:
557         case Token.NRANGE:
558             ret = Op.createRange(tok);
559             ret.next = next;
560             break;
561 
562         case Token.CONCAT:
563             ret = next;
564             if (!reverse) {
565                 for (int i = tok.size()-1;  i >= 0;  i --) {
566                     ret = compile(tok.getChild(i), ret, false);
567                 }
568             } else {
569                 for (int i = 0;  i < tok.size();  i ++) {
570                     ret = compile(tok.getChild(i), ret, true);
571                 }
572             }
573             break;
574 
575         case Token.UNION:
576             Op.UnionOp uni = Op.createUnion(tok.size());
577             for (int i = 0;  i < tok.size();  i ++) {
578                 uni.addElement(compile(tok.getChild(i), next, reverse));
579             }
580             ret = uni;                          // ret.next is null.
581             break;
582 
583         case Token.CLOSURE:
584         case Token.NONGREEDYCLOSURE:
585             Token child = tok.getChild(0);
586             int min = tok.getMin();
587             int max = tok.getMax();
588             if (min >= 0 && min == max) { // {n}
589                 ret = next;
590                 for (int i = 0; i < min;  i ++) {
591                     ret = compile(child, ret, reverse);
592                 }
593                 break;
594             }
595             if (min > 0 && max > 0)
596                 max -= min;
597             if (max > 0) {
598                 // X{2,6} -> XX(X(X(XX?)?)?)?
599                 ret = next;
600                 for (int i = 0;  i < max;  i ++) {
601                     Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
602                     q.next = next;
603                     q.setChild(compile(child, ret, reverse));
604                     ret = q;
605                 }
606             } else {
607                 Op.ChildOp op;
608                 if (tok.type == Token.NONGREEDYCLOSURE) {
609                     op = Op.createNonGreedyClosure();
610                 } else {                        // Token.CLOSURE
611                     if (child.getMinLength() == 0)
612                         op = Op.createClosure(this.numberOfClosures++);
613                     else
614                         op = Op.createClosure(-1);
615                 }
616                 op.next = next;
617                 op.setChild(compile(child, op, reverse));
618                 ret = op;
619             }
620             if (min > 0) {
621                 for (int i = 0;  i < min;  i ++) {
622                     ret = compile(child, ret, reverse);
623                 }
624             }
625             break;
626 
627         case Token.EMPTY:
628             ret = next;
629             break;
630 
631         case Token.STRING:
632             ret = Op.createString(tok.getString());
633             ret.next = next;
634             break;
635 
636         case Token.BACKREFERENCE:
637             ret = Op.createBackReference(tok.getReferenceNumber());
638             ret.next = next;
639             break;
640 
641         case Token.PAREN:
642             if (tok.getParenNumber() == 0) {
643                 ret = compile(tok.getChild(0), next, reverse);
644             } else if (reverse) {
645                 next = Op.createCapture(tok.getParenNumber(), next);
646                 next = compile(tok.getChild(0), next, reverse);
647                 ret = Op.createCapture(-tok.getParenNumber(), next);
648             } else {
649                 next = Op.createCapture(-tok.getParenNumber(), next);
650                 next = compile(tok.getChild(0), next, reverse);
651                 ret = Op.createCapture(tok.getParenNumber(), next);
652             }
653             break;
654 
655         case Token.LOOKAHEAD:
656             ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
657             break;
658         case Token.NEGATIVELOOKAHEAD:
659             ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
660             break;
661         case Token.LOOKBEHIND:
662             ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
663             break;
664         case Token.NEGATIVELOOKBEHIND:
665             ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
666             break;
667 
668         case Token.INDEPENDENT:
669             ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
670             break;
671 
672         case Token.MODIFIERGROUP:
673             ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
674                                     ((Token.ModifierToken)tok).getOptions(),
675                                     ((Token.ModifierToken)tok).getOptionsMask());
676             break;
677 
678         case Token.CONDITION:
679             Token.ConditionToken ctok = (Token.ConditionToken)tok;
680             int ref = ctok.refNumber;
681             Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
682             Op yes = compile(ctok.yes, next, reverse);
683             Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
684             ret = Op.createCondition(next, ref, condition, yes, no);
685             break;
686 
687         default:
688             throw new RuntimeException("Unknown token type: "+tok.type);
689         } // switch (tok.type)
690         return ret;
691     }
692 
693 
694 //Public
695 
696     /**
697      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
698      *
699      * @return true if the target is matched to this regular expression.
700      */
701     public boolean matches(char[]  target) {
702         return this.matches(target, 0,  target .length , (Match)null);
703     }
704 
705     /**
706      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
707      * in specified range or not.
708      *
709      * @param start Start offset of the range.
710      * @param end  End offset +1 of the range.
711      * @return true if the target is matched to this regular expression.
712      */
713     public boolean matches(char[]  target, int start, int end) {
714         return this.matches(target, start, end, (Match)null);
715     }
716 
717     /**
718      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
719      *
720      * @param match A Match instance for storing matching result.
721      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
722      */
723     public boolean matches(char[]  target, Match match) {
724         return this.matches(target, 0,  target .length , match);
725     }
726 
727 
728     /**
729      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
730      * in specified range or not.
731      *
732      * @param start Start offset of the range.
733      * @param end  End offset +1 of the range.
734      * @param match A Match instance for storing matching result.
735      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
736      */
737     public boolean matches(char[]  target, int start, int end, Match match) {
738 
739         synchronized (this) {
740             if (this.operations == null)
741                 this.prepare();
742             if (this.context == null)
743                 this.context = new Context();
744         }
745         Context con = null;
746         synchronized (this.context) {
747             con = this.context.inuse ? new Context() : this.context;
748             con.reset(target, start, end, this.numberOfClosures);
749         }
750         if (match != null) {
751             match.setNumberOfGroups(this.nofparen);
752             match.setSource(target);
753         } else if (this.hasBackReferences) {
754             match = new Match();
755             match.setNumberOfGroups(this.nofparen);
756             // Need not to call setSource() because
757             // a caller can not access this match instance.
758         }
759         con.match = match;
760 
761         if (this.isSet(this.options, XMLSCHEMA_MODE)) {
762             int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
763             //System.err.println("DEBUG: matchEnd="+matchEnd);
764             if (matchEnd == con.limit) {
765                 if (con.match != null) {
766                     con.match.setBeginning(0, con.start);
767                     con.match.setEnd(0, matchEnd);
768                 }
769                 con.inuse = false;
770                 return true;
771             }
772             return false;
773         }
774 
775         /*
776          * The pattern has only fixed string.
777          * The engine uses Boyer-Moore.
778          */
779         if (this.fixedStringOnly) {
780             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
781             int o = this.fixedStringTable.matches(target, con.start, con.limit);
782             if (o >= 0) {
783                 if (con.match != null) {
784                     con.match.setBeginning(0, o);
785                     con.match.setEnd(0, o+this.fixedString.length());
786                 }
787                 con.inuse = false;
788                 return true;
789             }
790             con.inuse = false;
791             return false;
792         }
793 
794         /*
795          * The pattern contains a fixed string.
796          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
797          * If not, it return with false.
798          */
799         if (this.fixedString != null) {
800             int o = this.fixedStringTable.matches(target, con.start, con.limit);
801             if (o < 0) {
802                 //System.err.println("Non-match in fixed-string search.");
803                 con.inuse = false;
804                 return false;
805             }
806         }
807 
808         int limit = con.limit-this.minlength;
809         int matchStart;
810         int matchEnd = -1;
811 
812         /*
813          * Checks whether the expression starts with ".*".
814          */
815         if (this.operations != null
816             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
817             if (isSet(this.options, SINGLE_LINE)) {
818                 matchStart = con.start;
819                 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
820             } else {
821                 boolean previousIsEOL = true;
822                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
823                     int ch =  target [  matchStart ] ;
824                     if (isEOLChar(ch)) {
825                         previousIsEOL = true;
826                     } else {
827                         if (previousIsEOL) {
828                             if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
829                                                                        matchStart, 1, this.options)))
830                                 break;
831                         }
832                         previousIsEOL = false;
833                     }
834                 }
835             }
836         }
837 
838         /*
839          * Optimization against the first character.
840          */
841         else if (this.firstChar != null) {
842             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
843             RangeToken range = this.firstChar;
844             if (this.isSet(this.options, IGNORE_CASE)) {
845                 range = this.firstChar.getCaseInsensitiveToken();
846                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
847                     int ch =  target [  matchStart ] ;
848                     if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
849                         ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
850                         if (!range.match(ch))  continue;
851                     } else {
852                         if (!range.match(ch)) {
853                             char ch1 = Character.toUpperCase((char)ch);
854                             if (!range.match(ch1))
855                                 if (!range.match(Character.toLowerCase(ch1)))
856                                     continue;
857                         }
858                     }
859                     if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
860                                                                matchStart, 1, this.options)))
861                         break;
862                 }
863             } else {
864                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
865                     int ch =  target [  matchStart ] ;
866                     if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
867                         ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
868                     if (!range.match(ch))  continue;
869                     if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
870                                                                matchStart, 1, this.options)))
871                         break;
872                 }
873             }
874         }
875 
876         /*
877          * Straightforward matching.
878          */
879         else {
880             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
881                 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
882                     break;
883             }
884         }
885 
886         if (matchEnd >= 0) {
887             if (con.match != null) {
888                 con.match.setBeginning(0, matchStart);
889                 con.match.setEnd(0, matchEnd);
890             }
891             con.inuse = false;
892             return true;
893         } else {
894             con.inuse = false;
895             return false;
896         }
897     }
898 
899 /**
900  * @return -1 when not match; offset of the end of matched string when match.
901  */
902     private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
903 
904         char[] target = con.charTarget;
905 
906 
907         while (true) {
908             if (op == null)
909                 return offset;
910             if (offset > con.limit || offset < con.start)
911                 return -1;
912             switch (op.type) {
913             case Op.CHAR:
914                 if (isSet(opts, IGNORE_CASE)) {
915                     int ch = op.getData();
916                     if (dx > 0) {
917                         if (offset >= con.limit || !matchIgnoreCase(ch,  target [  offset ] ))
918                             return -1;
919                         offset ++;
920                     } else {
921                         int o1 = offset-1;
922                         if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target [  o1 ] ))
923                             return -1;
924                         offset = o1;
925                     }
926                 } else {
927                     int ch = op.getData();
928                     if (dx > 0) {
929                         if (offset >= con.limit || ch !=  target [  offset ] )
930                             return -1;
931                         offset ++;
932                     } else {
933                         int o1 = offset-1;
934                         if (o1 >= con.limit || o1 < 0 || ch !=  target [  o1 ] )
935                             return -1;
936                         offset = o1;
937                     }
938                 }
939                 op = op.next;
940                 break;
941 
942             case Op.DOT:
943                 if (dx > 0) {
944                     if (offset >= con.limit)
945                         return -1;
946                     int ch =  target [  offset ] ;
947                     if (isSet(opts, SINGLE_LINE)) {
948                         if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
949                             offset ++;
950                     } else {
951                         if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
952                             ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
953                         if (isEOLChar(ch))
954                             return -1;
955                     }
956                     offset ++;
957                 } else {
958                     int o1 = offset-1;
959                     if (o1 >= con.limit || o1 < 0)
960                         return -1;
961                     int ch =  target [  o1 ] ;
962                     if (isSet(opts, SINGLE_LINE)) {
963                         if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
964                             o1 --;
965                     } else {
966                         if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
967                             ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
968                         if (!isEOLChar(ch))
969                             return -1;
970                     }
971                     offset = o1;
972                 }
973                 op = op.next;
974                 break;
975 
976             case Op.RANGE:
977             case Op.NRANGE:
978                 if (dx > 0) {
979                     if (offset >= con.limit)
980                         return -1;
981                     int ch =  target [  offset ] ;
982                     if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
983                         ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
984                     RangeToken tok = op.getToken();
985                     if (isSet(opts, IGNORE_CASE)) {
986                         tok = tok.getCaseInsensitiveToken();
987                         if (!tok.match(ch)) {
988                             if (ch >= 0x10000)  return -1;
989                             char uch;
990                             if (!tok.match(uch = Character.toUpperCase((char)ch))
991                                 && !tok.match(Character.toLowerCase(uch)))
992                                 return -1;
993                         }
994                     } else {
995                         if (!tok.match(ch))  return -1;
996                     }
997                     offset ++;
998                 } else {
999                     int o1 = offset-1;
1000                    if (o1 >= con.limit || o1 < 0)
1001                        return -1;
1002                    int ch =  target [  o1 ] ;
1003                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1004                        ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
1005                    RangeToken tok = op.getToken();
1006                    if (isSet(opts, IGNORE_CASE)) {
1007                        tok = tok.getCaseInsensitiveToken();
1008                        if (!tok.match(ch)) {
1009                            if (ch >= 0x10000)  return -1;
1010                            char uch;
1011                            if (!tok.match(uch = Character.toUpperCase((char)ch))
1012                                && !tok.match(Character.toLowerCase(uch)))
1013                                return -1;
1014                        }
1015                    } else {
1016                        if (!tok.match(ch))  return -1;
1017                    }
1018                    offset = o1;
1019                }
1020                op = op.next;
1021                break;
1022
1023            case Op.ANCHOR:
1024                boolean go = false;
1025                switch (op.getData()) {
1026                case '^':
1027                    if (isSet(opts, MULTIPLE_LINES)) {
1028                        if (!(offset == con.start
1029                              || offset > con.start && isEOLChar( target [  offset-1 ] )))
1030                            return -1;
1031                    } else {
1032                        if (offset != con.start)
1033                            return -1;
1034                    }
1035                    break;
1036
1037                case '@':                         // Internal use only.
1038                    // The @ always matches line beginnings.
1039                    if (!(offset == con.start
1040                          || offset > con.start && isEOLChar( target [  offset-1 ] )))
1041                        return -1;
1042                    break;
1043
1044                case '$':
1045                    if (isSet(opts, MULTIPLE_LINES)) {
1046                        if (!(offset == con.limit
1047                              || offset < con.limit && isEOLChar( target [  offset ] )))
1048                            return -1;
1049                    } else {
1050                        if (!(offset == con.limit
1051                              || offset+1 == con.limit && isEOLChar( target [  offset ] )
1052                              || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
1053                              &&  target [  offset+1 ]  == LINE_FEED))
1054                            return -1;
1055                    }
1056                    break;
1057
1058                case 'A':
1059                    if (offset != con.start)  return -1;
1060                    break;
1061
1062                case 'Z':
1063                    if (!(offset == con.limit
1064                          || offset+1 == con.limit && isEOLChar( target [  offset ] )
1065                          || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
1066                          &&  target [  offset+1 ]  == LINE_FEED))
1067                        return -1;
1068                    break;
1069
1070                case 'z':
1071                    if (offset != con.limit)  return -1;
1072                    break;
1073
1074                case 'b':
1075                    if (con.length == 0)  return -1;
1076                    {
1077                        int after = getWordType(target, con.start, con.limit, offset, opts);
1078                        if (after == WT_IGNORE)  return -1;
1079                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1080                        if (after == before)  return -1;
1081                    }
1082                    break;
1083
1084                case 'B':
1085                    if (con.length == 0)
1086                        go = true;
1087                    else {
1088                        int after = getWordType(target, con.start, con.limit, offset, opts);
1089                        go = after == WT_IGNORE
1090                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1091                    }
1092                    if (!go)  return -1;
1093                    break;
1094
1095                case '<':
1096                    if (con.length == 0 || offset == con.limit)  return -1;
1097                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1098                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1099                        return -1;
1100                    break;
1101
1102                case '>':
1103                    if (con.length == 0 || offset == con.start)  return -1;
1104                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1105                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1106                        return -1;
1107                    break;
1108                } // switch anchor type
1109                op = op.next;
1110                break;
1111
1112            case Op.BACKREFERENCE:
1113                {
1114                    int refno = op.getData();
1115                    if (refno <= 0 || refno >= this.nofparen)
1116                        throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1117                    if (con.match.getBeginning(refno) < 0
1118                        || con.match.getEnd(refno) < 0)
1119                        return -1;                // ********
1120                    int o2 = con.match.getBeginning(refno);
1121                    int literallen = con.match.getEnd(refno)-o2;
1122                    if (!isSet(opts, IGNORE_CASE)) {
1123                        if (dx > 0) {
1124                            if (!regionMatches(target, offset, con.limit, o2, literallen))
1125                                return -1;
1126                            offset += literallen;
1127                        } else {
1128                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1129                                return -1;
1130                            offset -= literallen;
1131                        }
1132                    } else {
1133                        if (dx > 0) {
1134                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1135                                return -1;
1136                            offset += literallen;
1137                        } else {
1138                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1139                                                         o2, literallen))
1140                                return -1;
1141                            offset -= literallen;
1142                        }
1143                    }
1144                }
1145                op = op.next;
1146                break;
1147            case Op.STRING:
1148                {
1149                    String literal = op.getString();
1150                    int literallen = literal.length();
1151                    if (!isSet(opts, IGNORE_CASE)) {
1152                        if (dx > 0) {
1153                            if (!regionMatches(target, offset, con.limit, literal, literallen))
1154                                return -1;
1155                            offset += literallen;
1156                        } else {
1157                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1158                                return -1;
1159                            offset -= literallen;
1160                        }
1161                    } else {
1162                        if (dx > 0) {
1163                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1164                                return -1;
1165                            offset += literallen;
1166                        } else {
1167                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1168                                                         literal, literallen))
1169                                return -1;
1170                            offset -= literallen;
1171                        }
1172                    }
1173                }
1174                op = op.next;
1175                break;
1176
1177            case Op.CLOSURE:
1178                {
1179                    /*
1180                     * Saves current position to avoid
1181                     * zero-width repeats.
1182                     */
1183                    int id = op.getData();
1184                    if (id >= 0) {
1185                        int previousOffset = con.offsets[id];
1186                        if (previousOffset < 0 || previousOffset != offset) {
1187                            con.offsets[id] = offset;
1188                        } else {
1189                            con.offsets[id] = -1;
1190                            op = op.next;
1191                            break;
1192                        }
1193                    }
1194
1195                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1196                    if (id >= 0)  con.offsets[id] = -1;
1197                    if (ret >= 0)  return ret;
1198                    op = op.next;
1199                }
1200                break;
1201
1202            case Op.QUESTION:
1203                {
1204                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1205                    if (ret >= 0)  return ret;
1206                    op = op.next;
1207                }
1208                break;
1209
1210            case Op.NONGREEDYCLOSURE:
1211            case Op.NONGREEDYQUESTION:
1212                {
1213                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1214                    if (ret >= 0)  return ret;
1215                    op = op.getChild();
1216                }
1217                break;
1218
1219            case Op.UNION:
1220                for (int i = 0;  i < op.size();  i ++) {
1221                    int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
1222                    //System.err.println("UNION: "+i+", ret="+ret);
1223                    if (ret >= 0)  return ret;
1224                }
1225                return -1;
1226
1227            case Op.CAPTURE:
1228                int refno = op.getData();
1229                if (con.match != null && refno > 0) {
1230                    int save = con.match.getBeginning(refno);
1231                    con.match.setBeginning(refno, offset);
1232                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1233