1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.regexp;
19
20 import java.io.Serializable;
21
22 /**
23 * A class that holds compiled regular expressions. This is exposed mainly
24 * for use by the recompile utility (which helps you produce precompiled
25 * REProgram objects). You should not otherwise need to work directly with
26 * this class.
27 *
28 * @see RE
29 * @see RECompiler
30 *
31 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
32 * @version $Id: REProgram.java 518156 2007-03-14 14:31:26Z vgritsenko $
33 */
34 public class REProgram implements Serializable
35 {
36 static final int OPT_HASBACKREFS = 1;
37 static final int OPT_HASBOL = 2;
38
39 char[] instruction; // The compiled regular expression 'program'
40 int lenInstruction; // The amount of the instruction buffer in use
41 char[] prefix; // Prefix string optimization
42 int flags; // Optimization flags (REProgram.OPT_*)
43 int maxParens = -1;
44
45 /**
46 * Constructs a program object from a character array
47 * @param instruction Character array with RE opcode instructions in it
48 */
49 public REProgram(char[] instruction)
50 {
51 this(instruction, instruction.length);
52 }
53
54 /**
55 * Constructs a program object from a character array
56 * @param parens Count of parens in the program
57 * @param instruction Character array with RE opcode instructions in it
58 */
59 public REProgram(int parens, char[] instruction)
60 {
61 this(instruction, instruction.length);
62 this.maxParens = parens;
63 }
64
65 /**
66 * Constructs a program object from a character array
67 * @param instruction Character array with RE opcode instructions in it
68 * @param lenInstruction Amount of instruction array in use
69 */
70 public REProgram(char[] instruction, int lenInstruction)
71 {
72 setInstructions(instruction, lenInstruction);
73 }
74
75 /**
76 * Returns a copy of the current regular expression program in a character
77 * array that is exactly the right length to hold the program. If there is
78 * no program compiled yet, getInstructions() will return null.
79 * @return A copy of the current compiled RE program
80 */
81 public char[] getInstructions()
82 {
83 // Ensure program has been compiled!
84 if (lenInstruction != 0)
85 {
86 // Return copy of program
87 char[] ret = new char[lenInstruction];
88 System.arraycopy(instruction, 0, ret, 0, lenInstruction);
89 return ret;
90 }
91 return null;
92 }
93
94 /**
95 * Sets a new regular expression program to run. It is this method which
96 * performs any special compile-time search optimizations. Currently only
97 * two optimizations are in place - one which checks for backreferences
98 * (so that they can be lazily allocated) and another which attempts to
99 * find an prefix anchor string so that substantial amounts of input can
100 * potentially be skipped without running the actual program.
101 * @param instruction Program instruction buffer
102 * @param lenInstruction Length of instruction buffer in use
103 */
104 public void setInstructions(char[] instruction, int lenInstruction)
105 {
106 // Save reference to instruction array
107 this.instruction = instruction;
108 this.lenInstruction = lenInstruction;
109
110 // Initialize other program-related variables
111 this.flags = 0;
112 this.prefix = null;
113
114 // Try various compile-time optimizations if there's a program
115 if (instruction != null && lenInstruction != 0)
116 {
117 // If the first node is a branch
118 if (lenInstruction >= RE.nodeSize && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH)
119 {
120 // to the end node
121 int next = (short) instruction[0 + RE.offsetNext];
122 if (instruction[next + RE.offsetOpcode] == RE.OP_END && lenInstruction >= (RE.nodeSize * 2))
123 {
124 final char nextOp = instruction[RE.nodeSize + RE.offsetOpcode];
125 // the branch starts with an atom
126 if (nextOp == RE.OP_ATOM)
127 {
128 // then get that atom as an prefix because there's no other choice
129 int lenAtom = instruction[RE.nodeSize + RE.offsetOpdata];
130 this.prefix = new char[lenAtom];
131 System.arraycopy(instruction, RE.nodeSize * 2, prefix, 0, lenAtom);
132 }
133 // the branch starts with a BOL
134 else if (nextOp == RE.OP_BOL)
135 {
136 // then set the flag indicating that BOL is present
137 this.flags |= OPT_HASBOL;
138 }
139 }
140 }
141
142 BackrefScanLoop:
143
144 // Check for backreferences
145 for (int i = 0; i < lenInstruction; i += RE.nodeSize)
146 {
147 switch (instruction[i + RE.offsetOpcode])
148 {
149 case RE.OP_ANYOF:
150 i += (instruction[i + RE.offsetOpdata] * 2);
151 break;
152
153 case RE.OP_ATOM:
154 i += instruction[i + RE.offsetOpdata];
155 break;
156
157 case RE.OP_BACKREF:
158 flags |= OPT_HASBACKREFS;
159 break BackrefScanLoop;
160 }
161 }
162 }
163 }
164
165 /**
166 * Returns a copy of the prefix of current regular expression program
167 * in a character array. If there is no prefix, or there is no program
168 * compiled yet, <code>getPrefix</code> will return null.
169 * @return A copy of the prefix of current compiled RE program
170 */
171 public char[] getPrefix()
172 {
173 if (prefix != null)
174 {
175 // Return copy of prefix
176 char[] ret = new char[prefix.length];
177 System.arraycopy(prefix, 0, ret, 0, prefix.length);
178 return ret;
179 }
180 return null;
181 }
182 }