1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.nutch.parse.html;
19
20 import java.net.URL;
21
22 import org.apache.nutch.parse.HTMLMetaTags;
23 import org.w3c.dom;
24
25 /**
26 * Class for parsing META Directives from DOM trees. This class
27 * handles specifically Robots META directives (all, none, nofollow,
28 * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
29 * instructions. All meta directives are stored in a HTMLMetaTags instance.
30 */
31 public class HTMLMetaProcessor {
32
33 /**
34 * Utility class with indicators for the robots directives "noindex"
35 * and "nofollow", and HTTP-EQUIV/no-cache
36 */
37
38 /**
39 * Sets the indicators in <code>robotsMeta</code> to appropriate
40 * values, based on any META tags found under the given
41 * <code>node</code>.
42 */
43 public static final void getMetaTags (
44 HTMLMetaTags metaTags, Node node, URL currURL) {
45
46 metaTags.reset();
47 getMetaTagsHelper(metaTags, node, currURL);
48 }
49
50 private static final void getMetaTagsHelper(
51 HTMLMetaTags metaTags, Node node, URL currURL) {
52
53 if (node.getNodeType() == Node.ELEMENT_NODE) {
54
55 if ("body".equalsIgnoreCase(node.getNodeName())) {
56 // META tags should not be under body
57 return;
58 }
59
60 if ("meta".equalsIgnoreCase(node.getNodeName())) {
61 NamedNodeMap attrs = node.getAttributes();
62 Node nameNode = null;
63 Node equivNode = null;
64 Node contentNode = null;
65 // Retrieves name, http-equiv and content attribues
66 for (int i=0; i<attrs.getLength(); i++) {
67 Node attr = attrs.item(i);
68 String attrName = attr.getNodeName().toLowerCase();
69 if (attrName.equals("name")) {
70 nameNode = attr;
71 } else if (attrName.equals("http-equiv")) {
72 equivNode = attr;
73 } else if (attrName.equals("content")) {
74 contentNode = attr;
75 }
76 }
77
78 if (nameNode != null) {
79 if (contentNode != null) {
80 String name = nameNode.getNodeValue().toLowerCase();
81 metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
82 if ("robots".equals(name)) {
83
84 if (contentNode != null) {
85 String directives =
86 contentNode.getNodeValue().toLowerCase();
87 int index = directives.indexOf("none");
88
89 if (index >= 0) {
90 metaTags.setNoIndex();
91 metaTags.setNoFollow();
92 }
93
94 index = directives.indexOf("all");
95 if (index >= 0) {
96 // do nothing...
97 }
98
99 index = directives.indexOf("noindex");
100 if (index >= 0) {
101 metaTags.setNoIndex();
102 }
103
104 index = directives.indexOf("nofollow");
105 if (index >= 0) {
106 metaTags.setNoFollow();
107 }
108
109 index = directives.indexOf("noarchive");
110 if (index >= 0) {
111 metaTags.setNoCache();
112 }
113 }
114
115 } // end if (name == robots)
116 }
117 }
118
119 if (equivNode != null) {
120 if (contentNode != null) {
121 String name = equivNode.getNodeValue().toLowerCase();
122 String content = contentNode.getNodeValue();
123 metaTags.getHttpEquivTags().setProperty(name, content);
124 if ("pragma".equals(name)) {
125 content = content.toLowerCase();
126 int index = content.indexOf("no-cache");
127 if (index >= 0)
128 metaTags.setNoCache();
129 } else if ("refresh".equals(name)) {
130 int idx = content.indexOf(';');
131 String time = null;
132 if (idx == -1) { // just the refresh time
133 time = content;
134 } else time = content.substring(0, idx);
135 try {
136 metaTags.setRefreshTime(Integer.parseInt(time));
137 // skip this if we couldn't parse the time
138 metaTags.setRefresh(true);
139 } catch (Exception e) {
140 ;
141 }
142 URL refreshUrl = null;
143 if (metaTags.getRefresh() && idx != -1) { // set the URL
144 idx = content.toLowerCase().indexOf("url=");
145 if (idx == -1) { // assume a mis-formatted entry with just the url
146 idx = content.indexOf(';') + 1;
147 } else idx += 4;
148 if (idx != -1) {
149 String url = content.substring(idx);
150 try {
151 refreshUrl = new URL(url);
152 } catch (Exception e) {
153 // XXX according to the spec, this has to be an absolute
154 // XXX url. However, many websites use relative URLs and
155 // XXX expect browsers to handle that.
156 // XXX Unfortunately, in some cases this may create a
157 // XXX infinitely recursive paths (a crawler trap)...
158 // if (!url.startsWith("/")) url = "/" + url;
159 try {
160 refreshUrl = new URL(currURL, url);
161 } catch (Exception e1) {
162 refreshUrl = null;
163 }
164 }
165 }
166 }
167 if (metaTags.getRefresh()) {
168 if (refreshUrl == null) {
169 // apparently only refresh time was present. set the URL
170 // to the same URL.
171 refreshUrl = currURL;
172 }
173 metaTags.setRefreshHref(refreshUrl);
174 }
175 }
176 }
177 }
178
179 } else if ("base".equalsIgnoreCase(node.getNodeName())) {
180 NamedNodeMap attrs = node.getAttributes();
181 Node hrefNode = attrs.getNamedItem("href");
182
183 if (hrefNode != null) {
184 String urlString = hrefNode.getNodeValue();
185
186 URL url = null;
187 try {
188 if (currURL == null)
189 url = new URL(urlString);
190 else
191 url = new URL(currURL, urlString);
192 } catch (Exception e) {
193 ;
194 }
195
196 if (url != null)
197 metaTags.setBaseHref(url);
198 }
199
200 }
201
202 }
203
204 NodeList children = node.getChildNodes();
205 if (children != null) {
206 int len = children.getLength();
207 for (int i = 0; i < len; i++) {
208 getMetaTagsHelper(metaTags, children.item(i), currURL);
209 }
210 }
211 }
212
213 }