1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.nutch.crawl;
19
20 import java.io;
21 import java.util;
22 import java.util.Map.Entry;
23
24 import org.apache.hadoop.io;
25 import org.apache.nutch.util;
26
27 /* The crawl state of a url. */
28 public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
29 public static final String GENERATE_DIR_NAME = "crawl_generate";
30 public static final String FETCH_DIR_NAME = "crawl_fetch";
31 public static final String PARSE_DIR_NAME = "crawl_parse";
32
33 private final static byte CUR_VERSION = 7;
34
35 /** Compatibility values for on-the-fly conversion from versions < 5. */
36 private static final byte OLD_STATUS_SIGNATURE = 0;
37 private static final byte OLD_STATUS_DB_UNFETCHED = 1;
38 private static final byte OLD_STATUS_DB_FETCHED = 2;
39 private static final byte OLD_STATUS_DB_GONE = 3;
40 private static final byte OLD_STATUS_LINKED = 4;
41 private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
42 private static final byte OLD_STATUS_FETCH_RETRY = 6;
43 private static final byte OLD_STATUS_FETCH_GONE = 7;
44
45 private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>();
46
47 /** Page was not fetched yet. */
48 public static final byte STATUS_DB_UNFETCHED = 0x01;
49 /** Page was successfully fetched. */
50 public static final byte STATUS_DB_FETCHED = 0x02;
51 /** Page no longer exists. */
52 public static final byte STATUS_DB_GONE = 0x03;
53 /** Page temporarily redirects to other page. */
54 public static final byte STATUS_DB_REDIR_TEMP = 0x04;
55 /** Page permanently redirects to other page. */
56 public static final byte STATUS_DB_REDIR_PERM = 0x05;
57 /** Page was successfully fetched and found not modified. */
58 public static final byte STATUS_DB_NOTMODIFIED = 0x06;
59
60 /** Maximum value of DB-related status. */
61 public static final byte STATUS_DB_MAX = 0x1f;
62
63 /** Fetching was successful. */
64 public static final byte STATUS_FETCH_SUCCESS = 0x21;
65 /** Fetching unsuccessful, needs to be retried (transient errors). */
66 public static final byte STATUS_FETCH_RETRY = 0x22;
67 /** Fetching temporarily redirected to other page. */
68 public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
69 /** Fetching permanently redirected to other page. */
70 public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
71 /** Fetching unsuccessful - page is gone. */
72 public static final byte STATUS_FETCH_GONE = 0x25;
73 /** Fetching successful - page is not modified. */
74 public static final byte STATUS_FETCH_NOTMODIFIED = 0x26;
75
76 /** Maximum value of fetch-related status. */
77 public static final byte STATUS_FETCH_MAX = 0x3f;
78
79 /** Page signature. */
80 public static final byte STATUS_SIGNATURE = 0x41;
81 /** Page was newly injected. */
82 public static final byte STATUS_INJECTED = 0x42;
83 /** Page discovered through a link. */
84 public static final byte STATUS_LINKED = 0x43;
85
86
87 public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>();
88 static {
89 statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
90 statNames.put(STATUS_DB_FETCHED, "db_fetched");
91 statNames.put(STATUS_DB_GONE, "db_gone");
92 statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp");
93 statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
94 statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
95 statNames.put(STATUS_SIGNATURE, "signature");
96 statNames.put(STATUS_INJECTED, "injected");
97 statNames.put(STATUS_LINKED, "linked");
98 statNames.put(STATUS_FETCH_SUCCESS, "fetch_success");
99 statNames.put(STATUS_FETCH_RETRY, "fetch_retry");
100 statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp");
101 statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
102 statNames.put(STATUS_FETCH_GONE, "fetch_gone");
103 statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
104
105 oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
106 oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
107 oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
108 oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE);
109 oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS);
110 oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY);
111 oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED);
112 oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE);
113 }
114
115 private byte status;
116 private long fetchTime = System.currentTimeMillis();
117 private byte retries;
118 private int fetchInterval;
119 private float score = 1.0f;
120 private byte[] signature = null;
121 private long modifiedTime;
122 private org.apache.hadoop.io.MapWritable metaData;
123
124 public static boolean hasDbStatus(CrawlDatum datum) {
125 if (datum.status <= STATUS_DB_MAX) return true;
126 return false;
127 }
128
129 public static boolean hasFetchStatus(CrawlDatum datum) {
130 if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX) return true;
131 return false;
132 }
133
134 public CrawlDatum() {
135 metaData = new org.apache.hadoop.io.MapWritable();
136 }
137
138 public CrawlDatum(int status, int fetchInterval) {
139 this();
140 this.status = (byte)status;
141 this.fetchInterval = fetchInterval;
142 }
143
144 public CrawlDatum(int status, int fetchInterval, float score) {
145 this(status, fetchInterval);
146 this.score = score;
147 }
148
149 //
150 // accessor methods
151 //
152
153 public byte getStatus() { return status; }
154
155 public static String getStatusName(byte value) {
156 String res = statNames.get(value);
157 if (res == null) res = "unknown";
158 return res;
159 }
160
161 public void setStatus(int status) { this.status = (byte)status; }
162
163 /**
164 * Returns either the time of the last fetch, or the next fetch time,
165 * depending on whether Fetcher or CrawlDbReducer set the time.
166 */
167 public long getFetchTime() { return fetchTime; }
168 /**
169 * Sets either the time of the last fetch or the next fetch time,
170 * depending on whether Fetcher or CrawlDbReducer set the time.
171 */
172 public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; }
173
174 public long getModifiedTime() {
175 return modifiedTime;
176 }
177
178 public void setModifiedTime(long modifiedTime) {
179 this.modifiedTime = modifiedTime;
180 }
181
182 public byte getRetriesSinceFetch() { return retries; }
183 public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;}
184
185 public int getFetchInterval() { return fetchInterval; }
186 public void setFetchInterval(int fetchInterval) {
187 this.fetchInterval = fetchInterval;
188 }
189 public void setFetchInterval(float fetchInterval) {
190 this.fetchInterval = Math.round(fetchInterval);
191 }
192
193 public float getScore() { return score; }
194 public void setScore(float score) { this.score = score; }
195
196 public byte[] getSignature() {
197 return signature;
198 }
199
200 public void setSignature(byte[] signature) {
201 if (signature != null && signature.length > 256)
202 throw new RuntimeException("Max signature length (256) exceeded: " + signature.length);
203 this.signature = signature;
204 }
205
206 public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
207 this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
208 }
209
210 /** Add all metadata from other CrawlDatum to this CrawlDatum.
211 *
212 * @param other CrawlDatum
213 */
214 public void putAllMetaData(CrawlDatum other) {
215 for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
216 metaData.put(e.getKey(), e.getValue());
217 }
218 }
219
220 /**
221 * returns a MapWritable if it was set or read in @see readFields(DataInput),
222 * returns empty map in case CrawlDatum was freshly created (lazily instantiated).
223 */
224 public org.apache.hadoop.io.MapWritable getMetaData() {
225 if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
226 return this.metaData;
227 }
228
229
230 //
231 // writable methods
232 //
233
234 public static CrawlDatum read(DataInput in) throws IOException {
235 CrawlDatum result = new CrawlDatum();
236 result.readFields(in);
237 return result;
238 }
239
240 public void readFields(DataInput in) throws IOException {
241 byte version = in.readByte(); // read version
242 if (version > CUR_VERSION) // check version
243 throw new VersionMismatchException(CUR_VERSION, version);
244
245 status = in.readByte();
246 fetchTime = in.readLong();
247 retries = in.readByte();
248 if (version > 5) {
249 fetchInterval = in.readInt();
250 } else fetchInterval = Math.round(in.readFloat());
251 score = in.readFloat();
252 if (version > 2) {
253 modifiedTime = in.readLong();
254 int cnt = in.readByte();
255 if (cnt > 0) {
256 signature = new byte[cnt];
257 in.readFully(signature);
258 } else signature = null;
259 }
260 metaData = new org.apache.hadoop.io.MapWritable();
261 if (version > 3) {
262 if (version < 7) {
263 MapWritable oldMetaData = new MapWritable();
264 if (in.readBoolean()) {
265 oldMetaData.readFields(in);
266 }
267 for (Writable key : oldMetaData.keySet()) {
268 metaData.put(key, oldMetaData.get(key));
269 }
270 } else {
271 if (in.readBoolean()) {
272 metaData.readFields(in);
273 }
274 }
275 }
276 // translate status codes
277 if (version < 5) {
278 if (oldToNew.containsKey(status))
279 status = oldToNew.get(status);
280 else
281 status = STATUS_DB_UNFETCHED;
282
283 }
284 }
285
286 /** The number of bytes into a CrawlDatum that the score is stored. */
287 private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
288 private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;
289
290 public void write(DataOutput out) throws IOException {
291 out.writeByte(CUR_VERSION); // store current version
292 out.writeByte(status);
293 out.writeLong(fetchTime);
294 out.writeByte(retries);
295 out.writeInt(fetchInterval);
296 out.writeFloat(score);
297 out.writeLong(modifiedTime);
298 if (signature == null) {
299 out.writeByte(0);
300 } else {
301 out.writeByte(signature.length);
302 out.write(signature);
303 }
304 if (metaData.size() > 0) {
305 out.writeBoolean(true);
306 metaData.write(out);
307 } else {
308 out.writeBoolean(false);
309 }
310 }
311
312 /** Copy the contents of another instance into this instance. */
313 public void set(CrawlDatum that) {
314 this.status = that.status;
315 this.fetchTime = that.fetchTime;
316 this.retries = that.retries;
317 this.fetchInterval = that.fetchInterval;
318 this.score = that.score;
319 this.modifiedTime = that.modifiedTime;
320 this.signature = that.signature;
321 this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy
322 }
323
324
325 //
326 // compare methods
327 //
328
329 /** Sort by decreasing score. */
330 public int compareTo(CrawlDatum that) {
331 if (that.score != this.score)
332 return (that.score - this.score) > 0 ? 1 : -1;
333 if (that.status != this.status)
334 return this.status - that.status;
335 if (that.fetchTime != this.fetchTime)
336 return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1;
337 if (that.retries != this.retries)
338 return that.retries - this.retries;
339 if (that.fetchInterval != this.fetchInterval)
340 return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
341 if (that.modifiedTime != this.modifiedTime)
342 return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
343 return SignatureComparator._compare(this, that);
344 }
345
346 /** A Comparator optimized for CrawlDatum. */
347 public static class Comparator extends WritableComparator {
348 public Comparator() { super(CrawlDatum.class); }
349
350 public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
351 float score1 = readFloat(b1,s1+SCORE_OFFSET);
352 float score2 = readFloat(b2,s2+SCORE_OFFSET);
353 if (score2 != score1) {
354 return (score2 - score1) > 0 ? 1 : -1;
355 }
356 int status1 = b1[s1+1];
357 int status2 = b2[s2+1];
358 if (status2 != status1)
359 return status1 - status2;
360 long fetchTime1 = readLong(b1, s1+1+1);
361 long fetchTime2 = readLong(b2, s2+1+1);
362 if (fetchTime2 != fetchTime1)
363 return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;
364 int retries1 = b1[s1+1+1+8];
365 int retries2 = b2[s2+1+1+8];
366 if (retries2 != retries1)
367 return retries2 - retries1;
368 int fetchInterval1 = readInt(b1, s1+1+1+8+1);
369 int fetchInterval2 = readInt(b2, s2+1+1+8+1);
370 if (fetchInterval2 != fetchInterval1)
371 return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
372 long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
373 long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
374 if (modifiedTime2 != modifiedTime1)
375 return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
376 int sigl1 = b1[s1+SIG_OFFSET];
377 int sigl2 = b2[s2+SIG_OFFSET];
378 return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2, SIG_OFFSET, sigl2);
379 }
380 }
381
382 static { // register this comparator
383 WritableComparator.define(CrawlDatum.class, new Comparator());
384 }
385
386
387 //
388 // basic methods
389 //
390
391 public String toString() {
392 StringBuilder buf = new StringBuilder();
393 buf.append("Version: " + CUR_VERSION + "\n");
394 buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n");
395 buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
396 buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
397 buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
398 buf.append("Retry interval: " + getFetchInterval() + " seconds (" +
399 (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
400 buf.append("Score: " + getScore() + "\n");
401 buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
402 buf.append("Metadata: ");
403 for (Entry<Writable, Writable> e : metaData.entrySet()) {
404 buf.append(e.getKey());
405 buf.append(": ");
406 buf.append(e.getValue());
407 }
408 buf.append('\n');
409 return buf.toString();
410 }
411
412 private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
413 HashSet<Entry<Writable, Writable>> set1 =
414 new HashSet<Entry<Writable,Writable>>(metaData.entrySet());
415 HashSet<Entry<Writable, Writable>> set2 =
416 new HashSet<Entry<Writable,Writable>>(otherMetaData.entrySet());
417 return set1.equals(set2);
418 }
419
420 public boolean equals(Object o) {
421 if (!(o instanceof CrawlDatum))
422 return false;
423 CrawlDatum other = (CrawlDatum)o;
424 boolean res =
425 (this.status == other.status) &&
426 (this.fetchTime == other.fetchTime) &&
427 (this.modifiedTime == other.modifiedTime) &&
428 (this.retries == other.retries) &&
429 (this.fetchInterval == other.fetchInterval) &&
430 (SignatureComparator._compare(this.signature, other.signature) == 0) &&
431 (this.score == other.score);
432 if (!res) return res;
433 return metadataEquals(other.metaData);
434 }
435
436 public int hashCode() {
437 int res = 0;
438 if (signature != null) {
439 for (int i = 0; i < signature.length / 4; i += 4) {
440 res ^= (int)(signature[i] << 24 + signature[i+1] << 16 +
441 signature[i+2] << 8 + signature[i+3]);
442 }
443 }
444 res ^= metaData.entrySet().hashCode();
445 return
446 res ^ status ^
447 ((int)fetchTime) ^
448 ((int)modifiedTime) ^
449 retries ^
450 fetchInterval ^
451 Float.floatToIntBits(score);
452 }
453
454 public Object clone() {
455 try {
456 return super.clone();
457 } catch (CloneNotSupportedException e) {
458 throw new RuntimeException(e);
459 }
460 }
461 }