| Method from org.apache.nutch.analysis.lang.NGramProfile Detail: |
/**
 * Adds the ngrams of a token to this profile.
 *
 * The token's term is wrapped with SEPARATOR on both sides and the padded
 * buffer is passed on to add(StringBuffer).
 *
 * @param t the token whose term is added
 */
public void add(Token t) {
  StringBuffer padded = new StringBuffer();
  padded.append(SEPARATOR);
  padded.append(t.term());
  padded.append(SEPARATOR);
  add(padded);
}
Add ngrams from a token to this profile |
public void add(StringBuffer word) {
for (int i=minLength; (i < = maxLength) && (i < word.length()); i++) {
add(word, i);
}
}
Add ngrams from a single word to this profile |
/**
 * Rebuilds this profile from the given text.
 *
 * Any previously collected ngrams, the cached sorted list and the per-length
 * counters are discarded first. The text is then scanned character by
 * character (lower-cased): letters are appended to the word buffer, which is
 * passed to add after every letter; any non-letter ends the current word.
 * A finished word is terminated with SEPARATOR before being added. Finally
 * the profile is normalized.
 *
 * @param text the text to analyze
 */
public void analyze(StringBuilder text) {
  // Forget whatever an earlier analysis left behind.
  if (ngrams != null) {
    ngrams.clear();
    sorted = null;
    ngramcounts = null;
  }

  // Every word starts with a leading separator.
  word.clear().append(SEPARATOR);

  for (int pos = 0; pos < text.length(); pos++) {
    char ch = Character.toLowerCase(text.charAt(pos));

    if (Character.isLetter(ch)) {
      add(word.append(ch));
      continue;
    }

    // Non-letter: a word boundary. Length 1 means only the leading separator.
    if (word.length() > 1) {
      add(word.append(SEPARATOR));
      word.clear().append(SEPARATOR);
    }
  }

  // The text may end in the middle of a word; close that word too.
  if (word.length() > 1) {
    add(word.append(SEPARATOR));
  }

  normalize();
}
|
/**
 * Creates a new profile by analyzing the content of a stream.
 *
 * The stream is decoded through a single reader, so multi-byte characters
 * that straddle a read-buffer boundary are decoded correctly (decoding each
 * byte chunk on its own would corrupt them). The stream is not closed here;
 * that remains the caller's responsibility.
 *
 * If reading fails (including an unsupported encoding name), the error is
 * written to the warn stream of LOG and the profile is built from whatever
 * text was read up to that point.
 *
 * @param name     name of the new profile
 * @param is       stream providing the text
 * @param encoding name of the character encoding of the stream
 * @return the profile built from the text that could be read
 */
public static NGramProfile create(String name,
                                  InputStream is,
                                  String encoding) {
  NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
                                             ABSOLUTE_MAX_NGRAM_LENGTH);
  StringBuilder text = new StringBuilder();
  char[] buffer = new char[4096];
  try {
    // The reader keeps decoder state between reads, unlike new String(bytes...).
    BufferedReader reader =
        new BufferedReader(new InputStreamReader(is, encoding));
    int len;
    while ((len = reader.read(buffer)) != -1) {
      text.append(buffer, 0, len);
    }
  } catch (IOException e) {
    e.printStackTrace(LogUtil.getWarnStream(LOG));
  }
  newProfile.analyze(text);
  return newProfile;
}
Create a new language profile from a (preferably quite large) text file |
/**
 * Returns the name of this profile.
 *
 * @return the value of the name field
 */
public String getName() {
  return this.name;
}
|
/**
 * Computes a score describing how far this profile is from another one.
 *
 * The score is a sum over the sorted ngrams of both profiles: an ngram
 * present in both contributes half the absolute difference of its two
 * frequencies (once per pass, so twice in total when it is in both sorted
 * lists); an ngram missing from the opposite profile contributes its whole
 * frequency.
 *
 * Any exception raised while scoring is logged at fatal level and the sum
 * accumulated so far is returned.
 *
 * @param another the profile to compare with
 * @return the accumulated score
 */
public float getSimilarity(NGramProfile another) {
  float sum = 0;

  try {
    // Pass 1: walk the other profile's ngrams and look them up here.
    for (Iterator<NGramEntry> it = another.getSorted().iterator(); it.hasNext();) {
      NGramEntry theirs = it.next();
      if (!ngrams.containsKey(theirs.seq)) {
        sum += theirs.frequency;
      } else {
        sum += Math.abs((theirs.frequency -
                         ngrams.get(theirs.seq).frequency)) / 2;
      }
    }

    // Pass 2: walk our own ngrams and look them up in the other profile.
    for (Iterator<NGramEntry> it = getSorted().iterator(); it.hasNext();) {
      NGramEntry mine = it.next();
      if (!another.ngrams.containsKey(mine.seq)) {
        sum += mine.frequency;
      } else {
        sum += Math.abs((mine.frequency -
                         another.ngrams.get(mine.seq).frequency)) / 2;
      }
    }
  } catch (Exception e) {
    if (LOG.isFatalEnabled()) {
      LOG.fatal(e.toString());
    }
  }

  return sum;
}
Calculate a score for how well two NGramProfiles match each other |
/**
 * Returns the ngrams of this profile as a sorted list.
 *
 * The list is built on first use and cached in the sorted field; later
 * calls return the cached list. Entries are ordered by the natural ordering
 * of NGramEntry and the list is cut down to at most MAX_SIZE entries.
 *
 * @return the cached, sorted (and possibly truncated) list of ngrams
 */
public List getSorted() {
  // Already computed: reuse the cached result.
  if (sorted != null) {
    return sorted;
  }

  sorted = new ArrayList<NGramEntry>(ngrams.values());
  Collections.sort(sorted);

  // Keep only the first MAX_SIZE entries.
  if (sorted.size() > MAX_SIZE) {
    sorted = sorted.subList(0, MAX_SIZE);
  }
  return sorted;
}
Return a sorted list of ngrams (sort done by 1. frequency 2. sequence) |
public void load(InputStream is) throws IOException {
ngrams.clear();
ngramcounts = new int[maxLength+1];
BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
String line = null;
while ((line = reader.readLine()) != null) {
// # starts a comment line
if (line.charAt(0) != '#") {
int spacepos = line.indexOf(' ");
String ngramsequence = line.substring(0, spacepos).trim();
int len = ngramsequence.length();
if ((len >= minLength) && (len < = maxLength)) {
int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
ngrams.put(en.getSeq(), en);
ngramcounts[len] += ngramcount;
}
}
}
normalize();
}
Loads an ngram profile from an InputStream
(assumes UTF-8 encoded content) |
/**
 * Command-line entry point, intended for testing only.
 *
 * Supported options (each takes exactly three operands):
 *   -create profilename filename encoding : build a profile from a text file
 *       and save it as profilename.FILE_EXTENSION
 *   -similarity file1 file2 encoding : build a profile from each file and
 *       print the score between them
 *   -score profile-name filename encoding : build a profile from the file
 *       and print its score against the saved profile
 *
 * If several options are given, the last one wins. Missing operands or the
 * absence of any known option print the usage text and exit with status -1.
 * Exceptions raised while executing the command are logged at fatal level.
 *
 * @param args command-line arguments
 */
public static void main(String[] args) {
  String usage = "Usage: NGramProfile " +
    "[-create profilename filename encoding] " +
    "[-similarity file1 file2 encoding] " +
    "[-score profile-name filename encoding]";
  final int CREATE = 1;
  final int SIMILARITY = 2;
  final int SCORE = 3;
  int command = 0;
  String profilename = "";
  String filename = "";
  String filename2 = "";
  String encoding = "";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
    return;
  }

  for (int i = 0; i < args.length; i++) {       // parse command line
    boolean isCreate = args[i].equals("-create");
    boolean isSimilarity = args[i].equals("-similarity");
    boolean isScore = args[i].equals("-score");
    if (!isCreate && !isSimilarity && !isScore) {
      continue;                                 // unknown arguments are ignored
    }

    // Every option consumes the three arguments that follow it.
    if (i + 3 >= args.length) {
      System.err.println(usage);
      System.exit(-1);
      return;
    }

    if (isCreate) {
      command = CREATE;
      profilename = args[++i];
      filename = args[++i];
      encoding = args[++i];
    } else if (isSimilarity) {
      command = SIMILARITY;
      filename = args[++i];
      filename2 = args[++i];
      encoding = args[++i];
    } else {
      command = SCORE;
      profilename = args[++i];
      filename = args[++i];
      encoding = args[++i];
    }
  }

  if (command == 0) {
    // No known option was found: tell the user instead of silently doing nothing.
    System.err.println(usage);
    System.exit(-1);
    return;
  }

  try {
    switch (command) {

    case CREATE: {
      NGramProfile created = createFromFile(profilename, filename, encoding);
      String target = profilename + "." + FILE_EXTENSION;
      FileOutputStream fos = new FileOutputStream(new File(target));
      try {
        created.save(fos);
      } finally {
        fos.close();
      }
      System.out.println("new profile " + target + " was created.");
      break;
    }

    case SIMILARITY: {
      // create() analyzes the text, which already normalizes the profile,
      // so no further normalize() call is needed here.
      NGramProfile first = createFromFile(filename, filename, encoding);
      NGramProfile second = createFromFile(filename2, filename2, encoding);
      System.out.println("Similarity is " + first.getSimilarity(second));
      break;
    }

    case SCORE: {
      NGramProfile sample = createFromFile(filename, filename, encoding);
      NGramProfile compare = new NGramProfile(profilename,
                                              DEFAULT_MIN_NGRAM_LENGTH,
                                              DEFAULT_MAX_NGRAM_LENGTH);
      FileInputStream fis =
          new FileInputStream(new File(profilename + "." + FILE_EXTENSION));
      try {
        compare.load(fis);
      } finally {
        fis.close();
      }
      System.out.println("Score is " + compare.getSimilarity(sample));
      break;
    }

    }
  } catch (Exception e) {
    if (LOG.isFatalEnabled()) { LOG.fatal("Caught an exception:" + e); }
  }
}

/**
 * Builds a profile called name from the content of the file filename,
 * closing the file whether or not profile creation succeeds.
 */
private static NGramProfile createFromFile(String name,
                                           String filename,
                                           String encoding) throws IOException {
  FileInputStream fis = new FileInputStream(new File(filename));
  try {
    return NGramProfile.create(name, fis, encoding);
  } finally {
    fis.close();
  }
}
main method used for testing only |
/**
 * Normalizes the profile by computing the frequency of every ngram.
 *
 * If the per-length counters have not been computed yet, they are built
 * first by summing the counts of all ngrams of each size. The frequency of
 * an ngram is its count divided by the total count of ngrams of the same
 * size.
 */
protected void normalize() {
  // Build the per-size totals only when nobody has done so already.
  if (ngramcounts == null) {
    ngramcounts = new int[maxLength+1];
    for (NGramEntry entry : ngrams.values()) {
      ngramcounts[entry.size()] += entry.count;
    }
  }

  for (NGramEntry entry : ngrams.values()) {
    entry.frequency = (float) entry.count / (float) ngramcounts[entry.size()];
  }
}
Normalize the profile (calculates the ngrams frequencies) |
public void save(OutputStream os) throws IOException {
// Write header
os.write(("# NgramProfile generated at " + new Date() +
" for Nutch Language Identification\n").getBytes());
// And then each ngram
// First dispatch ngrams in many lists depending on their size
// (one list for each size, in order to store MAX_SIZE ngrams for each
// size of ngram)
List< NGramEntry > list = new ArrayList< NGramEntry >();
List< NGramEntry > sublist = new ArrayList< NGramEntry >();
NGramEntry[] entries = ngrams.values().toArray(new NGramEntry[ngrams.size()]);
for (int i=minLength; i< =maxLength; i++) {
for (int j=0; j< entries.length; j++) {
if (entries[j].getSeq().length() == i) {
sublist.add(entries[j]);
}
}
Collections.sort(sublist);
if (sublist.size() > MAX_SIZE) {
sublist = sublist.subList(0, MAX_SIZE);
}
list.addAll(sublist);
sublist.clear();
}
for (int i=0; i< list.size(); i++) {
NGramEntry e = list.get(i);
String line = e.toString() + " " + e.getCount() + "\n";
os.write(line.getBytes("UTF-8"));
}
os.flush();
}
Writes the NGramProfile content to an OutputStream; the content is written
using UTF-8 encoding |
/**
 * Returns a multi-line textual dump of this profile.
 *
 * The first line is "NGramProfile: " followed by the profile name; each
 * following line describes one entry of the sorted ngram list in the form
 * [sequence/count/frequency].
 *
 * @return the textual representation of this profile
 */
public String toString() {
  StringBuilder out = new StringBuilder();
  out.append("NGramProfile: ");
  out.append(name);
  out.append("\n");

  for (Iterator<NGramEntry> it = getSorted().iterator(); it.hasNext();) {
    NGramEntry entry = it.next();
    out.append("[").append(entry.seq);
    out.append("/").append(entry.count);
    out.append("/").append(entry.frequency);
    out.append("]\n");
  }
  return out.toString();
}
|