/**
 * Scans forward from the current position and returns the next token that is
 * not a stop word.
 *
 * <p>Characters rejected by {@code isTokenChar} are skipped; the following
 * maximal run of accepted characters becomes the token text, lower-cased with
 * the configured locale when {@code toLowerCase} is set. Runs for which
 * {@code isStopWord} returns true are discarded and scanning continues. The
 * scan position is stored back into {@code pos} before returning.
 *
 * @return a {@code Token} built from the token text and its start (inclusive)
 *         and end (exclusive) indexes in the input string, or {@code null}
 *         when the end of the input is reached without finding a token
 */
public Token next() {
  // Work on local copies of the instance fields inside the hot loops (performance).
  final String input = str;
  final int end = input.length();
  final boolean lettersOnly = isLetter;
  int cursor = pos;

  while (true) {
    // Skip everything that cannot be part of a token.
    while (cursor < end && !isTokenChar(input.charAt(cursor), lettersOnly)) {
      cursor++;
    }
    if (cursor >= end) {
      // Input exhausted: remember where we stopped and signal "no more tokens".
      pos = cursor;
      return null;
    }

    // A token starts here; advance to the first character that ends it.
    final int tokenStart = cursor;
    while (cursor < end && isTokenChar(input.charAt(cursor), lettersOnly)) {
      cursor++;
    }

    // Historical note: a hand-rolled per-character lower-casing was once kept
    // around as a workaround for a JDK 1.5 String.toLowerCase() performance
    // regression (Sun bug 6265809); the plain library call is used here.
    String term = input.substring(tokenStart, cursor);
    if (toLowerCase) {
      term = term.toLowerCase(locale);
    }

    if (!isStopWord(term)) {
      pos = cursor;
      // Offsets refer to the original input, not to the (possibly lower-cased) term.
      return new Token(term, tokenStart, cursor);
    }
    // Stop word: drop it and keep scanning from the current cursor.
  }
}
|