Source code: com/mysql/jdbc/StringUtils.java
1 /*
2 Copyright (C) 2002-2004 MySQL AB
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of version 2 of the GNU General Public License as
6 published by the Free Software Foundation.
7
8
9 There are special exceptions to the terms and conditions of the GPL
10 as it is applied to this software. View the full text of the
11 exception in file EXCEPTIONS-CONNECTOR-J in the directory of this
12 software distribution.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
23 */
24 package com.mysql.jdbc;
25
26 import java.io.ByteArrayOutputStream;
27 import java.io.UnsupportedEncodingException;
28
29 import java.util.ArrayList;
30 import java.util.List;
31 import java.util.StringTokenizer;
32
33
34 /**
35 * Various utility methods for converting to/from byte arrays in the platform
36 * encoding
37 *
38 * @author Mark Matthews
39 */
40 public class StringUtils {
/** Number of distinct values a byte can take (256). */
private static final int BYTE_RANGE = (1 + Byte.MAX_VALUE) - Byte.MIN_VALUE;

/**
 * Every byte value in ascending signed order: slot i holds
 * (byte) (i + Byte.MIN_VALUE).
 */
private static byte[] allBytes = new byte[BYTE_RANGE];

/**
 * Characters obtained by decoding allBytes with the platform default
 * charset. Slots beyond the end of the decoded string keep the value '\0'.
 */
private static char[] byteToChars = new char[BYTE_RANGE];

static {
    for (int i = Byte.MIN_VALUE; i <= Byte.MAX_VALUE; i++) {
        allBytes[i - Byte.MIN_VALUE] = (byte) i;
    }

    // Decode the whole table (all BYTE_RANGE entries) so that the final
    // slot, the one belonging to Byte.MAX_VALUE, is populated as well.
    String allBytesString = new String(allBytes, 0, BYTE_RANGE);

    // A multi-byte platform charset may produce fewer chars than bytes, so
    // never read past the end of the decoded string.
    int limit = Math.min(BYTE_RANGE, allBytesString.length());

    for (int i = 0; i < limit; i++) {
        byteToChars[i] = allBytesString.charAt(i);
    }
}
61
62 /**
63 * Returns the byte[] representation of the given string using given
64 * encoding.
65 *
66 * @param s the string to convert
67 * @param encoding the character encoding to use
68 * @param serverEncoding DOCUMENT ME!
69 * @param parserKnowsUnicode DOCUMENT ME!
70 *
71 * @return byte[] representation of the string
72 *
73 * @throws UnsupportedEncodingException if an encoding unsupported by the
74 * JVM is supplied.
75 */
76 public static final byte[] getBytes(String s, String encoding,
77 String serverEncoding, boolean parserKnowsUnicode)
78 throws UnsupportedEncodingException {
79 SingleByteCharsetConverter converter = SingleByteCharsetConverter
80 .getInstance(encoding);
81
82 return getBytes(s, converter, encoding, serverEncoding,
83 parserKnowsUnicode);
84 }
85
86 /**
87 * Returns the byte[] representation of the given string (re)using the
88 * given charset converter, and the given encoding.
89 *
90 * @param s the string to convert
91 * @param converter the converter to reuse
92 * @param encoding the character encoding to use
93 * @param serverEncoding DOCUMENT ME!
94 * @param parserKnowsUnicode DOCUMENT ME!
95 *
96 * @return byte[] representation of the string
97 *
98 * @throws UnsupportedEncodingException if an encoding unsupported by the
99 * JVM is supplied.
100 */
101 public static final byte[] getBytes(String s,
102 SingleByteCharsetConverter converter, String encoding,
103 String serverEncoding, boolean parserKnowsUnicode)
104 throws UnsupportedEncodingException {
105 byte[] b = null;
106
107 if (converter != null) {
108 b = converter.toBytes(s);
109 } else if (encoding == null) {
110 b = s.getBytes();
111 } else {
112 b = s.getBytes(encoding);
113
114 if (!parserKnowsUnicode
115 && (encoding.equalsIgnoreCase("SJIS")
116 || encoding.equalsIgnoreCase("BIG5")
117 || encoding.equalsIgnoreCase("GBK"))) {
118 if (!encoding.equalsIgnoreCase(serverEncoding)) {
119 b = escapeEasternUnicodeByteStream(b, s, 0, s.length());
120 }
121 }
122 }
123
124 return b;
125 }
126
127 /**
128 * DOCUMENT ME!
129 *
130 * @param s DOCUMENT ME!
131 * @param converter DOCUMENT ME!
132 * @param encoding DOCUMENT ME!
133 * @param serverEncoding DOCUMENT ME!
134 * @param offset DOCUMENT ME!
135 * @param length DOCUMENT ME!
136 * @param parserKnowsUnicode DOCUMENT ME!
137 *
138 * @return DOCUMENT ME!
139 *
140 * @throws UnsupportedEncodingException DOCUMENT ME!
141 */
142 public static final byte[] getBytes(String s,
143 SingleByteCharsetConverter converter, String encoding,
144 String serverEncoding, int offset, int length,
145 boolean parserKnowsUnicode) throws UnsupportedEncodingException {
146 byte[] b = null;
147
148 if (converter != null) {
149 b = converter.toBytes(s, offset, length);
150 } else if (encoding == null) {
151 byte[] temp = s.getBytes();
152
153 b = new byte[length];
154 System.arraycopy(temp, offset, b, 0, length);
155 } else {
156 byte[] temp = s.getBytes(encoding);
157
158 b = new byte[length];
159 System.arraycopy(temp, offset, b, 0, length);
160
161 if (!parserKnowsUnicode
162 && (encoding.equalsIgnoreCase("SJIS")
163 || encoding.equalsIgnoreCase("BIG5")
164 || encoding.equalsIgnoreCase("GBK"))) {
165 if (!encoding.equalsIgnoreCase(serverEncoding)) {
166 b = escapeEasternUnicodeByteStream(b, s, offset, length);
167 }
168 }
169 }
170
171 return b;
172 }
173
174 /**
175 * Dumps the given bytes to STDOUT as a hex dump (up to length bytes).
176 *
177 * @param byteBuffer the data to print as hex
178 * @param length the number of bytes to print
179 */
180 public static final void dumpAsHex(byte[] byteBuffer, int length) {
181 int p = 0;
182 int rows = length / 8;
183
184 for (int i = 0; i < rows; i++) {
185 int ptemp = p;
186
187 for (int j = 0; j < 8; j++) {
188 String hexVal = Integer.toHexString((int) byteBuffer[ptemp]
189 & 0xff);
190
191 if (hexVal.length() == 1) {
192 hexVal = "0" + hexVal;
193 }
194
195 System.out.print(hexVal + " ");
196 ptemp++;
197 }
198
199 System.out.print(" ");
200
201 for (int j = 0; j < 8; j++) {
202 if ((byteBuffer[p] > 32) && (byteBuffer[p] < 127)) {
203 System.out.print((char) byteBuffer[p] + " ");
204 } else {
205 System.out.print(". ");
206 }
207
208 p++;
209 }
210
211 System.out.println();
212 }
213
214 int n = 0;
215
216 for (int i = p; i < length; i++) {
217 String hexVal = Integer.toHexString((int) byteBuffer[i] & 0xff);
218
219 if (hexVal.length() == 1) {
220 hexVal = "0" + hexVal;
221 }
222
223 System.out.print(hexVal + " ");
224 n++;
225 }
226
227 for (int i = n; i < 8; i++) {
228 System.out.print(" ");
229 }
230
231 System.out.print(" ");
232
233 for (int i = p; i < length; i++) {
234 if ((byteBuffer[i] > 32) && (byteBuffer[i] < 127)) {
235 System.out.print((char) byteBuffer[i] + " ");
236 } else {
237 System.out.print(". ");
238 }
239 }
240
241 System.out.println();
242 }
243
244 /**
245 * Returns the bytes as an ASCII String.
246 *
247 * @param buffer the bytes representing the string
248 *
249 * @return The ASCII String.
250 */
251 public static final String toAsciiString(byte[] buffer) {
252 return toAsciiString(buffer, 0, buffer.length);
253 }
254
255 /**
256 * Returns the bytes as an ASCII String.
257 *
258 * @param buffer the bytes to convert
259 * @param startPos the position to start converting
260 * @param length the length of the string to convert
261 *
262 * @return the ASCII string
263 */
264 public static final String toAsciiString(byte[] buffer, int startPos,
265 int length) {
266 char[] charArray = new char[length];
267 int readpoint = startPos;
268
269 for (int i = 0; i < length; i++) {
270 charArray[i] = (char) buffer[readpoint];
271 readpoint++;
272 }
273
274 return new String(charArray);
275 }
276
277 /**
278 * Unfortunately, SJIS has 0x5c as a high byte in some of its double-byte
279 * characters, so we need to escape it.
280 *
281 * @param origBytes the original bytes in SJIS format
282 * @param origString the string that had .getBytes() called on it
283 * @param offset where to start converting from
284 * @param length how many characters to convert.
285 *
286 * @return byte[] with 0x5c escaped
287 */
288 public static byte[] escapeEasternUnicodeByteStream(byte[] origBytes,
289 String origString, int offset, int length) {
290 if ((origBytes == null) || (origBytes.length == 0)) {
291 return origBytes;
292 }
293
294 int bytesLen = origBytes.length;
295 int bufIndex = 0;
296 int strIndex = 0;
297
298 ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(bytesLen);
299
300 while (true) {
301 if (origString.charAt(strIndex) == '\\') {
302 // write it out as-is
303 bytesOut.write(origBytes[bufIndex++]);
304
305 //bytesOut.write(origBytes[bufIndex++]);
306 } else {
307 // Grab the first byte
308 int loByte = (int) origBytes[bufIndex];
309
310 if (loByte < 0) {
311 loByte += 256; // adjust for signedness/wrap-around
312 }
313
314 // We always write the first byte
315 bytesOut.write(loByte);
316
317 //
318 // The codepage characters in question exist between
319 // 0x81-0x9F and 0xE0-0xFC...
320 //
321 // See:
322 //
323 // http://www.microsoft.com/GLOBALDEV/Reference/dbcs/932.htm
324 //
325 // Problematic characters in GBK
326 //
327 // U+905C : CJK UNIFIED IDEOGRAPH
328 //
329 // Problematic characters in Big5
330 //
331 // B9F0 = U+5C62 : CJK UNIFIED IDEOGRAPH
332 //
333 if (loByte >= 0x80) {
334 if (bufIndex < (bytesLen - 1)) {
335 int hiByte = (int) origBytes[bufIndex + 1];
336
337 if (hiByte < 0) {
338 hiByte += 256; // adjust for signedness/wrap-around
339 }
340
341 // write the high byte here, and increment the index
342 // for the high byte
343 bytesOut.write(hiByte);
344 bufIndex++;
345
346 // escape 0x5c if necessary
347 if (hiByte == 0x5C) {
348 bytesOut.write(hiByte);
349 }
350 }
351 } else if (loByte == 0x5c) {
352 if (bufIndex < (bytesLen - 1)) {
353 int hiByte = (int) origBytes[bufIndex + 1];
354
355 if (hiByte < 0) {
356 hiByte += 256; // adjust for signedness/wrap-around
357 }
358
359 if (hiByte == 0x62) {
360 // we need to escape the 0x5c
361 bytesOut.write(0x5c);
362 bytesOut.write(0x62);
363 bufIndex++;
364 }
365 }
366 }
367
368 bufIndex++;
369 }
370
371 if (bufIndex >= bytesLen) {
372 // we're done
373 break;
374 }
375
376 strIndex++;
377 }
378
379 return bytesOut.toByteArray();
380 }
381
382 /**
383 * Returns the first non whitespace char, converted to upper case
384 *
385 * @param searchIn the string to search in
386 *
387 * @return the first non-whitespace character, upper cased.
388 */
389 public static char firstNonWsCharUc(String searchIn) {
390 if (searchIn == null) {
391 return 0;
392 }
393
394 int length = searchIn.length();
395
396 for (int i = 0; i < length; i++) {
397 char c = searchIn.charAt(i);
398
399 if (!Character.isWhitespace(c)) {
400 return Character.toUpperCase(c);
401 }
402 }
403
404 return 0;
405 }
406
407 /**
408 * Splits stringToSplit into a list, using the given delimitter
409 *
410 * @param stringToSplit the string to split
411 * @param delimitter the string to split on
412 * @param trim should the split strings be whitespace trimmed?
413 *
414 * @return the list of strings, split by delimitter
415 *
416 * @throws IllegalArgumentException DOCUMENT ME!
417 */
418 public static final List split(String stringToSplit, String delimitter,
419 boolean trim) {
420 if (stringToSplit == null) {
421 return new ArrayList();
422 }
423
424 if (delimitter == null) {
425 throw new IllegalArgumentException();
426 }
427
428 StringTokenizer tokenizer = new StringTokenizer(stringToSplit,
429 delimitter, false);
430
431 List splitTokens = new ArrayList(tokenizer.countTokens());
432
433 while (tokenizer.hasMoreTokens()) {
434 String token = tokenizer.nextToken();
435
436 if (trim) {
437 token = token.trim();
438 }
439
440 splitTokens.add(token);
441 }
442
443 return splitTokens;
444 }
445
446 /**
447 * Determines whether or not the string 'searchIn' contains the string
448 * 'searchFor', dis-regarding case. Shorthand for a
449 * String.regionMatch(...)
450 *
451 * @param searchIn the string to search in
452 * @param searchFor the string to search for
453 *
454 * @return whether searchIn starts with searchFor, ignoring case
455 */
456 public static boolean startsWithIgnoreCase(String searchIn, String searchFor) {
457 return startsWithIgnoreCase(searchIn, 0, searchFor);
458 }
459
460 /**
461 * Determines whether or not the string 'searchIn' contains the string
462 * 'searchFor', dis-regarding case starting at 'startAt' Shorthand for a
463 * String.regionMatch(...)
464 *
465 * @param searchIn the string to search in
466 * @param startAt the position to start at
467 * @param searchFor the string to search for
468 *
469 * @return whether searchIn starts with searchFor, ignoring case
470 */
471 public static boolean startsWithIgnoreCase(String searchIn, int startAt,
472 String searchFor) {
473 return searchIn.regionMatches(true, 0, searchFor, startAt,
474 searchFor.length());
475 }
476
477 /**
478 * Determines whether or not the sting 'searchIn' contains the string
479 * 'searchFor', di-regarding case and leading whitespace
480 *
481 * @param searchIn the string to search in
482 * @param searchFor the string to search for
483 *
484 * @return true if the string starts with 'searchFor' ignoring whitespace
485 */
486 public static boolean startsWithIgnoreCaseAndWs(String searchIn,
487 String searchFor) {
488 int beginPos = 0;
489
490 int inLength = searchIn.length();
491
492 for (beginPos = 0; beginPos < inLength; beginPos++) {
493 if (!Character.isWhitespace(searchIn.charAt(beginPos))) {
494 break;
495 }
496 }
497
498 return startsWithIgnoreCase(searchIn, beginPos, searchFor);
499 }
500 }