Save This Page
Home » nutch-1.0 » org.apache.nutch » crawl » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.nutch.crawl;
   19   
   20   import java.io;
   21   import java.util;
   22   import java.util.Map.Entry;
   23   
   24   import org.apache.hadoop.io;
   25   import org.apache.nutch.util;
   26   
   27   /** The crawl state of a URL. */
   28   public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
   29     public static final String GENERATE_DIR_NAME = "crawl_generate";
   30     public static final String FETCH_DIR_NAME = "crawl_fetch";
   31     public static final String PARSE_DIR_NAME = "crawl_parse";
   32   
   33     private final static byte CUR_VERSION = 7;
   34   
   35     /** Compatibility values for on-the-fly conversion from versions < 5. */
   36     private static final byte OLD_STATUS_SIGNATURE = 0;
   37     private static final byte OLD_STATUS_DB_UNFETCHED = 1;
   38     private static final byte OLD_STATUS_DB_FETCHED = 2;
   39     private static final byte OLD_STATUS_DB_GONE = 3;
   40     private static final byte OLD_STATUS_LINKED = 4;
   41     private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
   42     private static final byte OLD_STATUS_FETCH_RETRY = 6;
   43     private static final byte OLD_STATUS_FETCH_GONE = 7;
   44     
   45     private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>();
   46     
   47     /** Page was not fetched yet. */
   48     public static final byte STATUS_DB_UNFETCHED      = 0x01;
   49     /** Page was successfully fetched. */
   50     public static final byte STATUS_DB_FETCHED        = 0x02;
   51     /** Page no longer exists. */
   52     public static final byte STATUS_DB_GONE           = 0x03;
   53     /** Page temporarily redirects to other page. */
   54     public static final byte STATUS_DB_REDIR_TEMP     = 0x04;
   55     /** Page permanently redirects to other page. */
   56     public static final byte STATUS_DB_REDIR_PERM     = 0x05;
   57     /** Page was successfully fetched and found not modified. */
   58     public static final byte STATUS_DB_NOTMODIFIED    = 0x06;
   59     
   60     /** Maximum value of DB-related status. */
   61     public static final byte STATUS_DB_MAX            = 0x1f;
   62     
   63     /** Fetching was successful. */
   64     public static final byte STATUS_FETCH_SUCCESS     = 0x21;
   65     /** Fetching unsuccessful, needs to be retried (transient errors). */
   66     public static final byte STATUS_FETCH_RETRY       = 0x22;
   67     /** Fetching temporarily redirected to other page. */
   68     public static final byte STATUS_FETCH_REDIR_TEMP  = 0x23;
   69     /** Fetching permanently redirected to other page. */
   70     public static final byte STATUS_FETCH_REDIR_PERM  = 0x24;
   71     /** Fetching unsuccessful - page is gone. */
   72     public static final byte STATUS_FETCH_GONE        = 0x25;
   73     /** Fetching successful - page is not modified. */
   74     public static final byte STATUS_FETCH_NOTMODIFIED = 0x26;
   75     
   76     /** Maximum value of fetch-related status. */
   77     public static final byte STATUS_FETCH_MAX         = 0x3f;
   78     
   79     /** Page signature. */
   80     public static final byte STATUS_SIGNATURE         = 0x41;
   81     /** Page was newly injected. */
   82     public static final byte STATUS_INJECTED          = 0x42;
   83     /** Page discovered through a link. */
   84     public static final byte STATUS_LINKED            = 0x43;
   85     
   86     
   87     public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>();
   88     static {
   89       statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
   90       statNames.put(STATUS_DB_FETCHED, "db_fetched");
   91       statNames.put(STATUS_DB_GONE, "db_gone");
   92       statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp");
   93       statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
   94       statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
   95       statNames.put(STATUS_SIGNATURE, "signature");
   96       statNames.put(STATUS_INJECTED, "injected");
   97       statNames.put(STATUS_LINKED, "linked");
   98       statNames.put(STATUS_FETCH_SUCCESS, "fetch_success");
   99       statNames.put(STATUS_FETCH_RETRY, "fetch_retry");
  100       statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp");
  101       statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
  102       statNames.put(STATUS_FETCH_GONE, "fetch_gone");
  103       statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
  104       
  105       oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
  106       oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
  107       oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
  108       oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE);
  109       oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS);
  110       oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY);
  111       oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED);
  112       oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE);
  113     }
  114   
  115     private byte status;
  116     private long fetchTime = System.currentTimeMillis();
  117     private byte retries;
  118     private int fetchInterval;
  119     private float score = 1.0f;
  120     private byte[] signature = null;
  121     private long modifiedTime;
  122     private org.apache.hadoop.io.MapWritable metaData;
  123     
  124     public static boolean hasDbStatus(CrawlDatum datum) {
  125       if (datum.status <= STATUS_DB_MAX) return true;
  126       return false;
  127     }
  128   
  129     public static boolean hasFetchStatus(CrawlDatum datum) {
  130       if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX) return true;
  131       return false;
  132     }
  133   
  /** Creates a datum with default field values and an empty metadata map. */
  public CrawlDatum() {
    this.metaData = new org.apache.hadoop.io.MapWritable();
  }
  137   
  /**
   * Creates a datum with the given status and fetch interval.
   *
   * @param status status code; narrowed to a byte
   * @param fetchInterval fetch interval to store
   */
  public CrawlDatum(int status, int fetchInterval) {
    this();
    this.fetchInterval = fetchInterval;
    this.status = (byte) status;
  }
  143   
  /**
   * Creates a datum with the given status, fetch interval and score.
   *
   * @param status status code; narrowed to a byte
   * @param fetchInterval fetch interval to store
   * @param score initial score
   */
  public CrawlDatum(int status, int fetchInterval, float score) {
    this(status, fetchInterval);
    this.score = score;
  }
  148   
  149     //
  150     // accessor methods
  151     //
  152   
  153     public byte getStatus() { return status; }
  154     
  155     public static String getStatusName(byte value) {
  156       String res = statNames.get(value);
  157       if (res == null) res = "unknown";
  158       return res;
  159     }
  160     
  161     public void setStatus(int status) { this.status = (byte)status; }
  162   
  163     /**
  164      * Returns either the time of the last fetch, or the next fetch time,
  165      * depending on whether Fetcher or CrawlDbReducer set the time.
  166      */
  167     public long getFetchTime() { return fetchTime; }
  168     /**
  169      * Sets either the time of the last fetch or the next fetch time,
  170      * depending on whether Fetcher or CrawlDbReducer set the time.
  171      */
  172     public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; }
  173   
  174     public long getModifiedTime() {
  175       return modifiedTime;
  176     }
  177   
  178     public void setModifiedTime(long modifiedTime) {
  179       this.modifiedTime = modifiedTime;
  180     }
  181     
  182     public byte getRetriesSinceFetch() { return retries; }
  183     public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;}
  184   
  185     public int getFetchInterval() { return fetchInterval; }
  186     public void setFetchInterval(int fetchInterval) {
  187       this.fetchInterval = fetchInterval;
  188     }
  189     public void setFetchInterval(float fetchInterval) {
  190       this.fetchInterval = Math.round(fetchInterval);
  191     }
  192   
  193     public float getScore() { return score; }
  194     public void setScore(float score) { this.score = score; }
  195   
  196     public byte[] getSignature() {
  197       return signature;
  198     }
  199   
  200     public void setSignature(byte[] signature) {
  201       if (signature != null && signature.length > 256)
  202         throw new RuntimeException("Max signature length (256) exceeded: " + signature.length);
  203       this.signature = signature;
  204     }
  205     
  206      public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
  207        this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
  208      }
  209      
  210      /** Add all metadata from other CrawlDatum to this CrawlDatum.
  211       * 
  212       * @param other CrawlDatum
  213       */
  214      public void putAllMetaData(CrawlDatum other) {
  215        for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
  216          metaData.put(e.getKey(), e.getValue());
  217        }
  218      }
  219   
  220     /**
  221      * returns a MapWritable if it was set or read in @see readFields(DataInput), 
  222      * returns empty map in case CrawlDatum was freshly created (lazily instantiated).
  223      */
  224     public org.apache.hadoop.io.MapWritable getMetaData() {
  225       if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
  226       return this.metaData;
  227     }
  228     
  229   
  230     //
  231     // writable methods
  232     //
  233   
  234     public static CrawlDatum read(DataInput in) throws IOException {
  235       CrawlDatum result = new CrawlDatum();
  236       result.readFields(in);
  237       return result;
  238     }
  239   
  240     public void readFields(DataInput in) throws IOException {
  241       byte version = in.readByte();                 // read version
  242       if (version > CUR_VERSION)                   // check version
  243         throw new VersionMismatchException(CUR_VERSION, version);
  244   
  245       status = in.readByte();
  246       fetchTime = in.readLong();
  247       retries = in.readByte();
  248       if (version > 5) {
  249         fetchInterval = in.readInt();
  250       } else fetchInterval = Math.round(in.readFloat());
  251       score = in.readFloat();
  252       if (version > 2) {
  253         modifiedTime = in.readLong();
  254         int cnt = in.readByte();
  255         if (cnt > 0) {
  256           signature = new byte[cnt];
  257           in.readFully(signature);
  258         } else signature = null;
  259       }
  260       metaData = new org.apache.hadoop.io.MapWritable();
  261       if (version > 3) {
  262         if (version < 7) {
  263           MapWritable oldMetaData = new MapWritable();
  264           if (in.readBoolean()) {
  265             oldMetaData.readFields(in);
  266           }
  267           for (Writable key : oldMetaData.keySet()) {
  268             metaData.put(key, oldMetaData.get(key));
  269           }
  270         } else {
  271           if (in.readBoolean()) {
  272             metaData.readFields(in);
  273           }
  274         }
  275       }
  276       // translate status codes
  277       if (version < 5) {
  278         if (oldToNew.containsKey(status))
  279           status = oldToNew.get(status);
  280         else
  281           status = STATUS_DB_UNFETCHED;
  282         
  283       }
  284     }
  285   
  286     /** The number of bytes into a CrawlDatum that the score is stored. */
  287     private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
  288     private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;
  289   
  290     public void write(DataOutput out) throws IOException {
  291       out.writeByte(CUR_VERSION);                   // store current version
  292       out.writeByte(status);
  293       out.writeLong(fetchTime);
  294       out.writeByte(retries);
  295       out.writeInt(fetchInterval);
  296       out.writeFloat(score);
  297       out.writeLong(modifiedTime);
  298       if (signature == null) {
  299         out.writeByte(0);
  300       } else {
  301         out.writeByte(signature.length);
  302         out.write(signature);
  303       }
  304       if (metaData.size() > 0) {
  305         out.writeBoolean(true);
  306         metaData.write(out);
  307       } else {
  308         out.writeBoolean(false);
  309       }
  310     }
  311   
  312     /** Copy the contents of another instance into this instance. */
  313     public void set(CrawlDatum that) {
  314       this.status = that.status;
  315       this.fetchTime = that.fetchTime;
  316       this.retries = that.retries;
  317       this.fetchInterval = that.fetchInterval;
  318       this.score = that.score;
  319       this.modifiedTime = that.modifiedTime;
  320       this.signature = that.signature;
  321       this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy
  322     }
  323   
  324   
  325     //
  326     // compare methods
  327     //
  328     
  329     /** Sort by decreasing score. */
  330     public int compareTo(CrawlDatum that) {
  331       if (that.score != this.score)
  332         return (that.score - this.score) > 0 ? 1 : -1;
  333       if (that.status != this.status)
  334         return this.status - that.status;
  335       if (that.fetchTime != this.fetchTime)
  336         return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1;
  337       if (that.retries != this.retries)
  338         return that.retries - this.retries;
  339       if (that.fetchInterval != this.fetchInterval)
  340         return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
  341       if (that.modifiedTime != this.modifiedTime)
  342         return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
  343       return SignatureComparator._compare(this, that);
  344     }
  345   
  346     /** A Comparator optimized for CrawlDatum. */ 
  347     public static class Comparator extends WritableComparator {
  348       public Comparator() { super(CrawlDatum.class); }
  349   
  350       public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
  351         float score1 = readFloat(b1,s1+SCORE_OFFSET);
  352         float score2 = readFloat(b2,s2+SCORE_OFFSET);
  353         if (score2 != score1) {
  354           return (score2 - score1) > 0 ? 1 : -1;
  355         }
  356         int status1 = b1[s1+1];
  357         int status2 = b2[s2+1];
  358         if (status2 != status1)
  359           return status1 - status2;
  360         long fetchTime1 = readLong(b1, s1+1+1);
  361         long fetchTime2 = readLong(b2, s2+1+1);
  362         if (fetchTime2 != fetchTime1)
  363           return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;
  364         int retries1 = b1[s1+1+1+8];
  365         int retries2 = b2[s2+1+1+8];
  366         if (retries2 != retries1)
  367           return retries2 - retries1;
  368         int fetchInterval1 = readInt(b1, s1+1+1+8+1);
  369         int fetchInterval2 = readInt(b2, s2+1+1+8+1);
  370         if (fetchInterval2 != fetchInterval1)
  371           return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
  372         long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
  373         long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
  374         if (modifiedTime2 != modifiedTime1)
  375           return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
  376         int sigl1 = b1[s1+SIG_OFFSET];
  377         int sigl2 = b2[s2+SIG_OFFSET];
  378         return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2, SIG_OFFSET, sigl2);
  379       }
  380     }
  381   
  382     static {                                        // register this comparator
  383       WritableComparator.define(CrawlDatum.class, new Comparator());
  384     }
  385   
  386   
  387     //
  388     // basic methods
  389     //
  390   
  391     public String toString() {
  392       StringBuilder buf = new StringBuilder();
  393       buf.append("Version: " + CUR_VERSION + "\n");
  394       buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n");
  395       buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
  396       buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
  397       buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
  398       buf.append("Retry interval: " + getFetchInterval() + " seconds (" +
  399           (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
  400       buf.append("Score: " + getScore() + "\n");
  401       buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
  402       buf.append("Metadata: ");
  403       for (Entry<Writable, Writable> e : metaData.entrySet()) {
  404         buf.append(e.getKey());
  405         buf.append(": ");
  406         buf.append(e.getValue());
  407       }
  408       buf.append('\n');
  409       return buf.toString();
  410     }
  411     
  412     private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
  413       HashSet<Entry<Writable, Writable>> set1 =
  414         new HashSet<Entry<Writable,Writable>>(metaData.entrySet());
  415       HashSet<Entry<Writable, Writable>> set2 =
  416         new HashSet<Entry<Writable,Writable>>(otherMetaData.entrySet());
  417       return set1.equals(set2);
  418     }
  419   
  420     public boolean equals(Object o) {
  421       if (!(o instanceof CrawlDatum))
  422         return false;
  423       CrawlDatum other = (CrawlDatum)o;
  424       boolean res =
  425         (this.status == other.status) &&
  426         (this.fetchTime == other.fetchTime) &&
  427         (this.modifiedTime == other.modifiedTime) &&
  428         (this.retries == other.retries) &&
  429         (this.fetchInterval == other.fetchInterval) &&
  430         (SignatureComparator._compare(this.signature, other.signature) == 0) &&
  431         (this.score == other.score);
  432       if (!res) return res;
  433       return metadataEquals(other.metaData);
  434     }
  435   
  436     public int hashCode() {
  437       int res = 0;
  438       if (signature != null) {
  439         for (int i = 0; i < signature.length / 4; i += 4) {
  440           res ^= (int)(signature[i] << 24 + signature[i+1] << 16 +
  441                   signature[i+2] << 8 + signature[i+3]);
  442         }
  443       }
  444       res ^= metaData.entrySet().hashCode();
  445       return
  446         res ^ status ^
  447         ((int)fetchTime) ^
  448         ((int)modifiedTime) ^
  449         retries ^
  450         fetchInterval ^
  451         Float.floatToIntBits(score);
  452     }
  453   
  454     public Object clone() {
  455       try {
  456         return super.clone();
  457       } catch (CloneNotSupportedException e) {
  458         throw new RuntimeException(e);
  459       }
  460     }
  461   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » crawl » [javadoc | source]