1
2 /* ====================================================================
3 Licensed to the Apache Software Foundation (ASF) under one or more
4 contributor license agreements. See the NOTICE file distributed with
5 this work for additional information regarding copyright ownership.
6 The ASF licenses this file to You under the Apache License, Version 2.0
7 (the "License"); you may not use this file except in compliance with
8 the License. You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 ==================================================================== */
18
19 package org.apache.poi.util;
20
21 import java.io.UnsupportedEncodingException;
22 import java.text.FieldPosition;
23 import java.text.NumberFormat;
/**
 * Collection of static string handling utilities: conversion between Java
 * strings and raw byte arrays holding 16-bit unicode (little or big endian)
 * or 8-bit ISO-8859-1 ("compressed unicode") data, plus a minimal
 * printf-like formatter that is primarily used for logging.
 *
 * <p>The class is not instantiable; every member is static.</p>
 *
 *@author Andrew C. Oliver
 *@author Sergei Kozello (sergeikozello at mail.ru)
 *@author Toshiaki Kamoshida (kamoshida.toshiaki at future dot co dot jp)
 *@since May 10, 2002
 *@version 1.0
 */
public class StringUtil {
    /** Name of the 8-bit charset used for "compressed unicode" data. */
    private final static String ENCODING = "ISO-8859-1";
    /** Charset name for 16-bit unicode, least significant byte first. */
    private final static String UTF16_LE = "UTF-16LE";
    /** Charset name for 16-bit unicode, most significant byte first. */
    private final static String UTF16_BE = "UTF-16BE";

    /**
     * Not instantiable: this is a pure static utility class.
     */
    private StringUtil() {
    }

    /**
     * Given a byte array of 16-bit unicode characters in little endian
     * format (most important byte last), return a Java String representation
     * of it.
     *
     * { 0x16, 0x00 } -0x16
     *
     * @param string the byte array to be converted
     * @param offset the initial offset into the byte array. It is assumed
     *        that string[ offset ] and string[ offset + 1 ] contain the
     *        first 16-bit unicode character
     * @param len the length of the final string, in 16-bit characters
     * @return the converted string
     * @exception ArrayIndexOutOfBoundsException if offset is out of bounds for
     *            the byte array (i.e., is negative or is greater than or
     *            equal to string.length)
     * @exception IllegalArgumentException if len is negative or too large
     *            (i.e., there is not enough data in string to create a
     *            String of that length)
     */
    public static String getFromUnicodeLE(
            final byte[] string,
            final int offset,
            final int len)
            throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
        return decodeUtf16(string, offset, len, UTF16_LE);
    }

    /**
     * Given a byte array of 16-bit unicode characters in little endian
     * format (most important byte last), return a Java String representation
     * of it. A trailing odd byte, if any, is ignored.
     *
     * { 0x16, 0x00 } -0x16
     *
     * @param string the byte array to be converted
     * @return the converted string; the empty string for an empty array
     */
    public static String getFromUnicodeLE(final byte[] string) {
        // The three-argument overload rejects offset 0 on an empty array,
        // so the empty case has to be answered here.
        if (string.length == 0) {
            return "";
        }
        return getFromUnicodeLE(string, 0, string.length / 2);
    }

    /**
     * Given a byte array of 16-bit unicode characters in big endian
     * format (most important byte first), return a Java String representation
     * of it.
     *
     * { 0x00, 0x16 } -0x16
     *
     * @param string the byte array to be converted
     * @param offset the initial offset into the byte array. It is assumed
     *        that string[ offset ] and string[ offset + 1 ] contain the
     *        first 16-bit unicode character
     * @param len the length of the final string, in 16-bit characters
     * @return the converted string
     * @exception ArrayIndexOutOfBoundsException if offset is out of bounds for
     *            the byte array (i.e., is negative or is greater than or
     *            equal to string.length)
     * @exception IllegalArgumentException if len is negative or too large
     *            (i.e., there is not enough data in string to create a
     *            String of that length)
     */
    public static String getFromUnicodeBE(
            final byte[] string,
            final int offset,
            final int len)
            throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
        return decodeUtf16(string, offset, len, UTF16_BE);
    }

    /**
     * Given a byte array of 16-bit unicode characters in big endian
     * format (most important byte first), return a Java String representation
     * of it. A trailing odd byte, if any, is ignored.
     *
     * { 0x00, 0x16 } -0x16
     *
     * @param string the byte array to be converted
     * @return the converted string; the empty string for an empty array
     */
    public static String getFromUnicodeBE(final byte[] string) {
        // See getFromUnicodeLE(byte[]): offset 0 is illegal for an empty array.
        if (string.length == 0) {
            return "";
        }
        return getFromUnicodeBE(string, 0, string.length / 2);
    }

    /**
     * Read 8 bit data (in ISO-8859-1 codepage) into a (unicode) Java
     * String and return.
     * (In Excel terms, read compressed 8 bit unicode as a string)
     *
     * <p>If fewer than <code>len</code> bytes are available after
     * <code>offset</code>, only the available bytes are decoded.</p>
     *
     * @param string byte array to read
     * @param offset offset at which to start reading the byte array
     * @param len maximum number of bytes to read
     * @return the String built from the bytes that were read
     */
    public static String getFromCompressedUnicode(
            final byte[] string,
            final int offset,
            final int len) {
        // Clamp rather than fail when the caller asks for more than exists.
        final int available = string.length - offset;
        final int lenToUse = Math.min(len, available);
        try {
            return new String(string, offset, lenToUse, ENCODING);
        } catch (UnsupportedEncodingException e) {
            throw unreachable(e);
        }
    }

    /**
     * Takes a unicode (java) string, and writes it as 8 bit data (in
     * ISO-8859-1 codepage) into the supplied byte array.
     * (In Excel terms, write compressed 8 bit unicode)
     *
     * @param input the String containing the data to be written
     * @param output the byte array to which the data is to be written
     * @param offset an offset into the byte array at which the data starts
     *        when written
     */
    public static void putCompressedUnicode(
            final String input,
            final byte[] output,
            final int offset) {
        encodeInto(input, output, offset, ENCODING);
    }

    /**
     * Takes a unicode string, and writes it as little endian (most
     * important byte last) bytes into the supplied byte array.
     * (In Excel terms, write uncompressed unicode)
     *
     * @param input the String containing the unicode data to be written
     * @param output the byte array to hold the uncompressed unicode, should
     *        be twice the length of the String
     * @param offset the offset to start writing into the byte array
     */
    public static void putUnicodeLE(
            final String input,
            final byte[] output,
            final int offset) {
        encodeInto(input, output, offset, UTF16_LE);
    }

    /**
     * Takes a unicode string, and writes it as big endian (most
     * important byte first) bytes into the supplied byte array.
     * (In Excel terms, write uncompressed unicode)
     *
     * @param input the String containing the unicode data to be written
     * @param output the byte array to hold the uncompressed unicode, should
     *        be twice the length of the String
     * @param offset the offset to start writing into the byte array
     */
    public static void putUnicodeBE(
            final String input,
            final byte[] output,
            final int offset) {
        encodeInto(input, output, offset, UTF16_BE);
    }

    /**
     * Apply printf() like formatting to a string.
     * Primarily used for logging.
     *
     * <p>Each unescaped '%' in the message is replaced by the next value
     * from <code>params</code>. A backslash directly followed by '%' emits
     * a literal '%'. When the values run out, the text
     * <code>?missing data?</code> is emitted instead. For values that are
     * {@link Number}s the '%' may be followed by a one-digit minimum
     * integer width and/or a '.' plus a one-digit maximum fraction count
     * (eg. "%2.2", "%2", "%.3"); numbers are rendered with the default
     * locale's {@link NumberFormat}.</p>
     *
     * <p>A <code>null</code> value is rendered as the text "null", and a
     * <code>null</code> params array is treated as an empty one, so that a
     * log call never fails because of a missing value.</p>
     *
     * @param message the string with embedded formatting info
     *        eg. "This is a test %2.2"
     * @param params array of values to format into the string
     * @return The formatted string
     */
    public static String format(String message, Object[] params) {
        final Object[] values = (params == null) ? new Object[0] : params;
        final int messageLength = message.length();
        final StringBuffer formattedMessage = new StringBuffer();
        int nextValue = 0;

        for (int i = 0; i < messageLength; i++) {
            final char current = message.charAt(i);
            final boolean hasFollower = i + 1 < messageLength;

            if (current == '%') {
                if (nextValue >= values.length) {
                    formattedMessage.append("?missing data?");
                    continue;
                }
                final Object value = values[nextValue++];
                if ((value instanceof Number) && hasFollower) {
                    // Skip over whatever the number formatter consumed.
                    i += matchOptionalFormatting(
                            (Number) value,
                            message.substring(i + 1),
                            formattedMessage);
                } else {
                    // String.valueOf tolerates a null value.
                    formattedMessage.append(String.valueOf(value));
                }
            } else if ((current == '\\')
                    && hasFollower
                    && (message.charAt(i + 1) == '%')) {
                // Escaped percent sign: emit it and skip the '%'.
                formattedMessage.append('%');
                i++;
            } else {
                formattedMessage.append(current);
            }
        }
        return formattedMessage.toString();
    }

    /**
     * Formats <code>number</code> into <code>outputTo</code>, honouring an
     * optional "d", "d.d" or ".d" spec (d being a single digit) at the
     * start of <code>formatting</code>.
     *
     * <p>NOTE(review): when no spec is recognised this still reports one
     * character as consumed, so the caller drops the character that follows
     * the '%'. That historical behaviour is preserved here because existing
     * messages may rely on it (eg. "%d") - confirm before changing.</p>
     *
     * @param number the value to render
     * @param formatting the message text that follows the '%'
     * @param outputTo buffer the rendered number is appended to
     * @return how many characters of <code>formatting</code> the caller
     *         must skip (1, 2 or 3)
     */
    private static int matchOptionalFormatting(
            Number number,
            String formatting,
            StringBuffer outputTo) {
        final NumberFormat numberFormat = NumberFormat.getInstance();
        final int specLength = formatting.length();
        int consumed = 1;

        if ((specLength > 0) && Character.isDigit(formatting.charAt(0))) {
            numberFormat.setMinimumIntegerDigits(
                    Character.digit(formatting.charAt(0), 10));
            if ((specLength > 2)
                    && (formatting.charAt(1) == '.')
                    && Character.isDigit(formatting.charAt(2))) {
                numberFormat.setMaximumFractionDigits(
                        Character.digit(formatting.charAt(2), 10));
                consumed = 3;
            }
        } else if ((specLength > 1)
                && (formatting.charAt(0) == '.')
                && Character.isDigit(formatting.charAt(1))) {
            numberFormat.setMaximumFractionDigits(
                    Character.digit(formatting.charAt(1), 10));
            consumed = 2;
        }

        numberFormat.format(number, outputTo, new FieldPosition(0));
        return consumed;
    }

    /**
     * @return the encoding we want to use, currently hardcoded to ISO-8859-1
     */
    public static String getPreferredEncoding() {
        return ENCODING;
    }

    /**
     * Checks whether the parameter contains a multibyte character, i.e. a
     * char whose value is above 0xFF.
     *
     * @param value string to check, may be <code>null</code>
     * @return <code>true</code> if the string has at least one multibyte
     *         character; <code>false</code> otherwise, including for
     *         <code>null</code>
     */
    public static boolean hasMultibyte(String value) {
        if (value == null) {
            return false;
        }
        final int length = value.length();
        for (int i = 0; i < length; i++) {
            if (value.charAt(i) > 0xFF) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks to see if a given String needs to be represented as Unicode,
     * i.e. whether it fails to survive a round trip through ISO-8859-1.
     *
     * @param value the string to check, may be <code>null</code>
     * @return true if string needs Unicode to be represented; false for
     *         <code>null</code>, matching {@link #hasMultibyte(String)}
     */
    public static boolean isUnicodeString(final String value) {
        if (value == null) {
            return false;
        }
        try {
            final byte[] latin1 = value.getBytes(ENCODING);
            final String roundTripped = new String(latin1, ENCODING);
            return !value.equals(roundTripped);
        } catch (UnsupportedEncodingException e) {
            // Cannot verify the 8-bit form, so fall back to Unicode.
            return true;
        }
    }

    /**
     * Validates the requested window and decodes it as 16-bit unicode in
     * the given byte order.
     */
    private static String decodeUtf16(
            final byte[] string,
            final int offset,
            final int len,
            final String charsetName) {
        if ((offset < 0) || (offset >= string.length)) {
            throw new ArrayIndexOutOfBoundsException(
                    "Illegal offset " + offset
                    + " (data is of length " + string.length + ")");
        }
        if ((len < 0) || (((string.length - offset) / 2) < len)) {
            throw new IllegalArgumentException(
                    "Illegal length " + len
                    + " (offset " + offset
                    + ", data is of length " + string.length + ")");
        }
        try {
            // Two bytes per character; cannot overflow after the check above.
            return new String(string, offset, len * 2, charsetName);
        } catch (UnsupportedEncodingException e) {
            throw unreachable(e);
        }
    }

    /**
     * Encodes input in the named charset and copies the bytes into output
     * starting at offset.
     */
    private static void encodeInto(
            final String input,
            final byte[] output,
            final int offset,
            final String charsetName) {
        try {
            final byte[] bytes = input.getBytes(charsetName);
            System.arraycopy(bytes, 0, output, offset, bytes.length);
        } catch (UnsupportedEncodingException e) {
            throw unreachable(e);
        }
    }

    /**
     * Builds the error for a supposedly impossible missing charset while
     * keeping the original exception as its cause.
     */
    private static InternalError unreachable(UnsupportedEncodingException e) {
        final InternalError error = new InternalError(
                "Required charset is not available: " + e.getMessage());
        error.initCause(e);
        return error;
    }
}