Save This Page
Home » nutch-1.0 » org.apache.nutch » protocol » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.nutch.protocol;
   19   
   20   //JDK imports
   21   import java.io.ByteArrayInputStream;
   22   import java.io.DataInput;
   23   import java.io.DataInputStream;
   24   import java.io.DataOutput;
   25   import java.io.IOException;
   26   import java.util.Arrays;
   27   import java.util.zip.InflaterInputStream;
   28   
   29   //Hadoop imports
   30   import org.apache.hadoop.conf.Configuration;
   31   import org.apache.hadoop.fs.FileSystem;
   32   import org.apache.hadoop.fs.Path;
   33   import org.apache.hadoop.io.ArrayFile;
   34   import org.apache.hadoop.io.Text;
   35   import org.apache.hadoop.io.UTF8;
   36   import org.apache.hadoop.io.VersionMismatchException;
   37   import org.apache.hadoop.io.Writable;
   38   
   39   //Nutch imports
   40   import org.apache.nutch.metadata.Metadata;
   41   import org.apache.nutch.util.MimeUtil;
   42   import org.apache.nutch.util.NutchConfiguration;
   43   
   44   public final class Content implements Writable{
   45   
   46     public static final String DIR_NAME = "content";
   47   
   48     private final static int VERSION = -1;
   49   
   50     private int version;
   51   
   52     private String url;
   53   
   54     private String base;
   55   
   56     private byte[] content;
   57   
   58     private String contentType;
   59   
   60     private Metadata metadata;
   61   
   62     private MimeUtil mimeTypes;
   63   
   64     public Content() {
   65       metadata = new Metadata();
   66     }
   67   
   68     public Content(String url, String base, byte[] content, String contentType,
   69         Metadata metadata, Configuration conf) {
   70   
   71       if (url == null)
   72         throw new IllegalArgumentException("null url");
   73       if (base == null)
   74         throw new IllegalArgumentException("null base");
   75       if (content == null)
   76         throw new IllegalArgumentException("null content");
   77       if (metadata == null)
   78         throw new IllegalArgumentException("null metadata");
   79   
   80       this.url = url;
   81       this.base = base;
   82       this.content = content;
   83       this.metadata = metadata;
   84   
   85       this.mimeTypes = new MimeUtil(conf);
   86       this.contentType = getContentType(contentType, url, content);
   87     }
   88   
   89     private final void readFieldsCompressed(DataInput in) throws IOException {
   90       byte oldVersion = in.readByte();
   91       switch (oldVersion) {
   92       case 0:
   93       case 1:
   94         url = UTF8.readString(in); // read url
   95         base = UTF8.readString(in); // read base
   96   
   97         content = new byte[in.readInt()]; // read content
   98         in.readFully(content);
   99   
  100         contentType = UTF8.readString(in); // read contentType
  101         // reconstruct metadata
  102         int keySize = in.readInt();
  103         String key;
  104         for (int i = 0; i < keySize; i++) {
  105           key = UTF8.readString(in);
  106           int valueSize = in.readInt();
  107           for (int j = 0; j < valueSize; j++) {
  108             metadata.add(key, UTF8.readString(in));
  109           }
  110         }
  111         break;
  112       case 2:
  113         url = Text.readString(in); // read url
  114         base = Text.readString(in); // read base
  115   
  116         content = new byte[in.readInt()]; // read content
  117         in.readFully(content);
  118   
  119         contentType = Text.readString(in); // read contentType
  120         metadata.readFields(in); // read meta data
  121         break;
  122       default:
  123         throw new VersionMismatchException((byte)2, oldVersion);
  124       }
  125   
  126     }
  127     
  128     public final void readFields(DataInput in) throws IOException {
  129       metadata.clear();
  130       int sizeOrVersion = in.readInt();
  131       if (sizeOrVersion < 0) { // version
  132         version = sizeOrVersion;
  133         switch (version) {
  134         case VERSION:
  135           url = Text.readString(in);
  136           base = Text.readString(in);
  137   
  138           content = new byte[in.readInt()];
  139           in.readFully(content);
  140   
  141           contentType = Text.readString(in);
  142           metadata.readFields(in);
  143           break;
  144         default:
  145           throw new VersionMismatchException((byte)VERSION, (byte)version);
  146         }
  147       } else { // size
  148         byte[] compressed = new byte[sizeOrVersion];
  149         in.readFully(compressed, 0, compressed.length);
  150         ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
  151         DataInput inflater =
  152           new DataInputStream(new InflaterInputStream(deflated));
  153         readFieldsCompressed(inflater);
  154       }
  155     }
  156   
  157     public final void write(DataOutput out) throws IOException {
  158       out.writeInt(VERSION);
  159   
  160       Text.writeString(out, url); // write url
  161       Text.writeString(out, base); // write base
  162   
  163       out.writeInt(content.length); // write content
  164       out.write(content);
  165   
  166       Text.writeString(out, contentType); // write contentType
  167   
  168       metadata.write(out); // write metadata
  169     }
  170   
  171     public static Content read(DataInput in) throws IOException {
  172       Content content = new Content();
  173       content.readFields(in);
  174       return content;
  175     }
  176   
  177     //
  178     // Accessor methods
  179     //
  180   
  181     /** The url fetched. */
  182     public String getUrl() {
  183       return url;
  184     }
  185   
  186     /** The base url for relative links contained in the content.
  187      * Maybe be different from url if the request redirected.
  188      */
  189     public String getBaseUrl() {
  190       return base;
  191     }
  192   
  193     /** The binary content retrieved. */
  194     public byte[] getContent() {
  195       return content;
  196     }
  197   
  198     public void setContent(byte[] content) {
  199       this.content = content;
  200     }
  201   
  202     /** The media type of the retrieved content.
  203      * @see <a href="http://www.iana.org/assignments/media-types/">
  204      *      http://www.iana.org/assignments/media-types/</a>
  205      */
  206     public String getContentType() {
  207       return contentType;
  208     }
  209   
  210     public void setContentType(String contentType) {
  211       this.contentType = contentType;
  212     }
  213   
  214     /** Other protocol-specific data. */
  215     public Metadata getMetadata() {
  216       return metadata;
  217     }
  218   
  219     /** Other protocol-specific data. */
  220     public void setMetadata(Metadata metadata) {
  221       this.metadata = metadata;
  222     }
  223   
  224     public boolean equals(Object o) {
  225       if (!(o instanceof Content)) {
  226         return false;
  227       }
  228       Content that = (Content) o;
  229       return this.url.equals(that.url) && this.base.equals(that.base)
  230           && Arrays.equals(this.getContent(), that.getContent())
  231           && this.contentType.equals(that.contentType)
  232           && this.metadata.equals(that.metadata);
  233     }
  234   
  235     public String toString() {
  236       StringBuffer buffer = new StringBuffer();
  237   
  238       buffer.append("Version: " + version + "\n");
  239       buffer.append("url: " + url + "\n");
  240       buffer.append("base: " + base + "\n");
  241       buffer.append("contentType: " + contentType + "\n");
  242       buffer.append("metadata: " + metadata + "\n");
  243       buffer.append("Content:\n");
  244       buffer.append(new String(content)); // try default encoding
  245   
  246       return buffer.toString();
  247   
  248     }
  249   
  250     public static void main(String argv[]) throws Exception {
  251   
  252       String usage = "Content (-local | -dfs <namenode:port>) recno segment";
  253   
  254       if (argv.length < 3) {
  255         System.out.println("usage:" + usage);
  256         return;
  257       }
  258       Configuration conf = NutchConfiguration.create();
  259       FileSystem fs = FileSystem.parseArgs(argv, 0, conf);
  260       try {
  261         int recno = Integer.parseInt(argv[0]);
  262         String segment = argv[1];
  263   
  264         Path file = new Path(segment, DIR_NAME);
  265         System.out.println("Reading from file: " + file);
  266   
  267         ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
  268             conf);
  269   
  270         Content content = new Content();
  271         contents.get(recno, content);
  272         System.out.println("Retrieved " + recno + " from file " + file);
  273   
  274         System.out.println(content);
  275   
  276         contents.close();
  277       } finally {
  278         fs.close();
  279       }
  280     }
  281   
  282     private String getContentType(String typeName, String url, byte[] data) {
  283       return this.mimeTypes.autoResolveContentType(typeName, url, data);
  284     }
  285   
  286   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » protocol » [javadoc | source]