1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.nutch.protocol;
19
20 //JDK imports
21 import java.io.ByteArrayInputStream;
22 import java.io.DataInput;
23 import java.io.DataInputStream;
24 import java.io.DataOutput;
25 import java.io.IOException;
26 import java.util.Arrays;
27 import java.util.zip.InflaterInputStream;
28
29 //Hadoop imports
30 import org.apache.hadoop.conf.Configuration;
31 import org.apache.hadoop.fs.FileSystem;
32 import org.apache.hadoop.fs.Path;
33 import org.apache.hadoop.io.ArrayFile;
34 import org.apache.hadoop.io.Text;
35 import org.apache.hadoop.io.UTF8;
36 import org.apache.hadoop.io.VersionMismatchException;
37 import org.apache.hadoop.io.Writable;
38
39 //Nutch imports
40 import org.apache.nutch.metadata.Metadata;
41 import org.apache.nutch.util.MimeUtil;
42 import org.apache.nutch.util.NutchConfiguration;
43
44 public final class Content implements Writable{
45
46 public static final String DIR_NAME = "content";
47
48 private final static int VERSION = -1;
49
50 private int version;
51
52 private String url;
53
54 private String base;
55
56 private byte[] content;
57
58 private String contentType;
59
60 private Metadata metadata;
61
62 private MimeUtil mimeTypes;
63
64 public Content() {
65 metadata = new Metadata();
66 }
67
68 public Content(String url, String base, byte[] content, String contentType,
69 Metadata metadata, Configuration conf) {
70
71 if (url == null)
72 throw new IllegalArgumentException("null url");
73 if (base == null)
74 throw new IllegalArgumentException("null base");
75 if (content == null)
76 throw new IllegalArgumentException("null content");
77 if (metadata == null)
78 throw new IllegalArgumentException("null metadata");
79
80 this.url = url;
81 this.base = base;
82 this.content = content;
83 this.metadata = metadata;
84
85 this.mimeTypes = new MimeUtil(conf);
86 this.contentType = getContentType(contentType, url, content);
87 }
88
89 private final void readFieldsCompressed(DataInput in) throws IOException {
90 byte oldVersion = in.readByte();
91 switch (oldVersion) {
92 case 0:
93 case 1:
94 url = UTF8.readString(in); // read url
95 base = UTF8.readString(in); // read base
96
97 content = new byte[in.readInt()]; // read content
98 in.readFully(content);
99
100 contentType = UTF8.readString(in); // read contentType
101 // reconstruct metadata
102 int keySize = in.readInt();
103 String key;
104 for (int i = 0; i < keySize; i++) {
105 key = UTF8.readString(in);
106 int valueSize = in.readInt();
107 for (int j = 0; j < valueSize; j++) {
108 metadata.add(key, UTF8.readString(in));
109 }
110 }
111 break;
112 case 2:
113 url = Text.readString(in); // read url
114 base = Text.readString(in); // read base
115
116 content = new byte[in.readInt()]; // read content
117 in.readFully(content);
118
119 contentType = Text.readString(in); // read contentType
120 metadata.readFields(in); // read meta data
121 break;
122 default:
123 throw new VersionMismatchException((byte)2, oldVersion);
124 }
125
126 }
127
128 public final void readFields(DataInput in) throws IOException {
129 metadata.clear();
130 int sizeOrVersion = in.readInt();
131 if (sizeOrVersion < 0) { // version
132 version = sizeOrVersion;
133 switch (version) {
134 case VERSION:
135 url = Text.readString(in);
136 base = Text.readString(in);
137
138 content = new byte[in.readInt()];
139 in.readFully(content);
140
141 contentType = Text.readString(in);
142 metadata.readFields(in);
143 break;
144 default:
145 throw new VersionMismatchException((byte)VERSION, (byte)version);
146 }
147 } else { // size
148 byte[] compressed = new byte[sizeOrVersion];
149 in.readFully(compressed, 0, compressed.length);
150 ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
151 DataInput inflater =
152 new DataInputStream(new InflaterInputStream(deflated));
153 readFieldsCompressed(inflater);
154 }
155 }
156
157 public final void write(DataOutput out) throws IOException {
158 out.writeInt(VERSION);
159
160 Text.writeString(out, url); // write url
161 Text.writeString(out, base); // write base
162
163 out.writeInt(content.length); // write content
164 out.write(content);
165
166 Text.writeString(out, contentType); // write contentType
167
168 metadata.write(out); // write metadata
169 }
170
171 public static Content read(DataInput in) throws IOException {
172 Content content = new Content();
173 content.readFields(in);
174 return content;
175 }
176
177 //
178 // Accessor methods
179 //
180
181 /** The url fetched. */
182 public String getUrl() {
183 return url;
184 }
185
186 /** The base url for relative links contained in the content.
187 * Maybe be different from url if the request redirected.
188 */
189 public String getBaseUrl() {
190 return base;
191 }
192
193 /** The binary content retrieved. */
194 public byte[] getContent() {
195 return content;
196 }
197
198 public void setContent(byte[] content) {
199 this.content = content;
200 }
201
202 /** The media type of the retrieved content.
203 * @see <a href="http://www.iana.org/assignments/media-types/">
204 * http://www.iana.org/assignments/media-types/</a>
205 */
206 public String getContentType() {
207 return contentType;
208 }
209
210 public void setContentType(String contentType) {
211 this.contentType = contentType;
212 }
213
214 /** Other protocol-specific data. */
215 public Metadata getMetadata() {
216 return metadata;
217 }
218
219 /** Other protocol-specific data. */
220 public void setMetadata(Metadata metadata) {
221 this.metadata = metadata;
222 }
223
224 public boolean equals(Object o) {
225 if (!(o instanceof Content)) {
226 return false;
227 }
228 Content that = (Content) o;
229 return this.url.equals(that.url) && this.base.equals(that.base)
230 && Arrays.equals(this.getContent(), that.getContent())
231 && this.contentType.equals(that.contentType)
232 && this.metadata.equals(that.metadata);
233 }
234
235 public String toString() {
236 StringBuffer buffer = new StringBuffer();
237
238 buffer.append("Version: " + version + "\n");
239 buffer.append("url: " + url + "\n");
240 buffer.append("base: " + base + "\n");
241 buffer.append("contentType: " + contentType + "\n");
242 buffer.append("metadata: " + metadata + "\n");
243 buffer.append("Content:\n");
244 buffer.append(new String(content)); // try default encoding
245
246 return buffer.toString();
247
248 }
249
250 public static void main(String argv[]) throws Exception {
251
252 String usage = "Content (-local | -dfs <namenode:port>) recno segment";
253
254 if (argv.length < 3) {
255 System.out.println("usage:" + usage);
256 return;
257 }
258 Configuration conf = NutchConfiguration.create();
259 FileSystem fs = FileSystem.parseArgs(argv, 0, conf);
260 try {
261 int recno = Integer.parseInt(argv[0]);
262 String segment = argv[1];
263
264 Path file = new Path(segment, DIR_NAME);
265 System.out.println("Reading from file: " + file);
266
267 ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
268 conf);
269
270 Content content = new Content();
271 contents.get(recno, content);
272 System.out.println("Retrieved " + recno + " from file " + file);
273
274 System.out.println(content);
275
276 contents.close();
277 } finally {
278 fs.close();
279 }
280 }
281
282 private String getContentType(String typeName, String url, byte[] data) {
283 return this.mimeTypes.autoResolveContentType(typeName, url, data);
284 }
285
286 }