org.apache.lucene.analysis
public class: ISOLatin1AccentFilter [javadoc |
source]
java.lang.Object
org.apache.lucene.analysis.TokenStream
org.apache.lucene.analysis.TokenFilter
org.apache.lucene.analysis.ISOLatin1AccentFilter
A filter that replaces accented characters in the ISO Latin 1 character set
(ISO-8859-1) by their unaccented equivalent. The case will not be altered.
For instance, 'à' will be replaced by 'a'.
| Method from org.apache.lucene.analysis.ISOLatin1AccentFilter Summary: |
|---|
|
next, removeAccents |
| Methods from org.apache.lucene.analysis.TokenFilter: |
|---|
|
close, reset |
| Method from org.apache.lucene.analysis.ISOLatin1AccentFilter Detail: |
public final Token next(Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
final char[] buffer = nextToken.termBuffer();
final int length = nextToken.termLength();
// If no characters actually require rewriting then we
// just return token as-is:
for(int i=0;i< length;i++) {
final char c = buffer[i];
if (c >= '\u00c0" && c < = '\uFB06") {
removeAccents(buffer, length);
nextToken.setTermBuffer(output, 0, outputPos);
break;
}
}
return nextToken;
} else
return null;
}
|
public final void removeAccents(char[] input,
int length) {
// Worst-case length required:
final int maxSizeNeeded = 2*length;
int size = output.length;
while (size < maxSizeNeeded)
size *= 2;
if (size != output.length)
output = new char[size];
outputPos = 0;
int pos = 0;
for (int i=0; i< length; i++, pos++) {
final char c = input[pos];
// Quick test: if it's not in range then just keep
// current character
if (c < '\u00c0" || c > '\uFB06")
output[outputPos++] = c;
else {
switch (c) {
case '\u00C0" : // À
case '\u00C1" : // Á
case '\u00C2" : // Â
case '\u00C3" : // Ã
case '\u00C4" : // Ä
case '\u00C5" : // Å
output[outputPos++] = 'A";
break;
case '\u00C6" : // Æ
output[outputPos++] = 'A";
output[outputPos++] = 'E";
break;
case '\u00C7" : // Ç
output[outputPos++] = 'C";
break;
case '\u00C8" : // È
case '\u00C9" : // É
case '\u00CA" : // Ê
case '\u00CB" : // Ë
output[outputPos++] = 'E";
break;
case '\u00CC" : // Ì
case '\u00CD" : // Í
case '\u00CE" : // Î
case '\u00CF" : // Ï
output[outputPos++] = 'I";
break;
case '\u0132" : // IJ
output[outputPos++] = 'I";
output[outputPos++] = 'J";
break;
case '\u00D0" : // Ð
output[outputPos++] = 'D";
break;
case '\u00D1" : // Ñ
output[outputPos++] = 'N";
break;
case '\u00D2" : // Ò
case '\u00D3" : // Ó
case '\u00D4" : // Ô
case '\u00D5" : // Õ
case '\u00D6" : // Ö
case '\u00D8" : // Ø
output[outputPos++] = 'O";
break;
case '\u0152" : // Œ
output[outputPos++] = 'O";
output[outputPos++] = 'E";
break;
case '\u00DE" : // Þ
output[outputPos++] = 'T";
output[outputPos++] = 'H";
break;
case '\u00D9" : // Ù
case '\u00DA" : // Ú
case '\u00DB" : // Û
case '\u00DC" : // Ü
output[outputPos++] = 'U";
break;
case '\u00DD" : // Ý
case '\u0178" : // Ÿ
output[outputPos++] = 'Y";
break;
case '\u00E0" : // à
case '\u00E1" : // á
case '\u00E2" : // â
case '\u00E3" : // ã
case '\u00E4" : // ä
case '\u00E5" : // å
output[outputPos++] = 'a";
break;
case '\u00E6" : // æ
output[outputPos++] = 'a";
output[outputPos++] = 'e";
break;
case '\u00E7" : // ç
output[outputPos++] = 'c";
break;
case '\u00E8" : // è
case '\u00E9" : // é
case '\u00EA" : // ê
case '\u00EB" : // ë
output[outputPos++] = 'e";
break;
case '\u00EC" : // ì
case '\u00ED" : // í
case '\u00EE" : // î
case '\u00EF" : // ï
output[outputPos++] = 'i";
break;
case '\u0133" : // ij
output[outputPos++] = 'i";
output[outputPos++] = 'j";
break;
case '\u00F0" : // ð
output[outputPos++] = 'd";
break;
case '\u00F1" : // ñ
output[outputPos++] = 'n";
break;
case '\u00F2" : // ò
case '\u00F3" : // ó
case '\u00F4" : // ô
case '\u00F5" : // õ
case '\u00F6" : // ö
case '\u00F8" : // ø
output[outputPos++] = 'o";
break;
case '\u0153" : // œ
output[outputPos++] = 'o";
output[outputPos++] = 'e";
break;
case '\u00DF" : // ß
output[outputPos++] = 's";
output[outputPos++] = 's";
break;
case '\u00FE" : // þ
output[outputPos++] = 't";
output[outputPos++] = 'h";
break;
case '\u00F9" : // ù
case '\u00FA" : // ú
case '\u00FB" : // û
case '\u00FC" : // ü
output[outputPos++] = 'u";
break;
case '\u00FD" : // ý
case '\u00FF" : // ÿ
output[outputPos++] = 'y";
break;
case '\uFB00": // ff
output[outputPos++] = 'f";
output[outputPos++] = 'f";
break;
case '\uFB01": // fi
output[outputPos++] = 'f";
output[outputPos++] = 'i";
break;
case '\uFB02": // fl
output[outputPos++] = 'f";
output[outputPos++] = 'l";
break;
// following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
// case '\uFB03': // ffi
// output[outputPos++] = 'f';
// output[outputPos++] = 'f';
// output[outputPos++] = 'i';
// break;
// case '\uFB04': // ffl
// output[outputPos++] = 'f';
// output[outputPos++] = 'f';
// output[outputPos++] = 'l';
// break;
case '\uFB05": // ſt
output[outputPos++] = 'f";
output[outputPos++] = 't";
break;
case '\uFB06": // st
output[outputPos++] = 's";
output[outputPos++] = 't";
break;
default :
output[outputPos++] = c;
break;
}
}
}
}
Replaces accented characters in the given character buffer with their unaccented equivalents. |