Source code: org/htmlparser/scanners/LinkScanner.java
1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/scanners/LinkScanner.java,v 1.2 2004/02/10 13:41:09 woolfel Exp $
2 /*
3 * ====================================================================
4 * Copyright 2002-2004 The Apache Software Foundation.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19
20 // The developers of JMeter and Apache are grateful to the developers
21 // of HTMLParser for giving Apache Software Foundation a non-exclusive
22 // license. The performance benefits of HTMLParser are clear and the
23 // users of JMeter will benefit from the hard work of the HTMLParser
24 // team. For detailed information about HTMLParser, the project is
25 // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26 //
27 // HTMLParser was originally created by Somik Raha in 2000. Since then
28 // a healthy community of users has formed and helped refine the
29 // design so that it is able to tackle the difficult task of parsing
30 // dirty HTML. Derrick Oswald is the current lead developer and was kind
31 // enough to assist JMeter.
32
33 package org.htmlparser.scanners;
34
35 //////////////////
36 // Java Imports //
37 //////////////////
38 import java.util.Hashtable;
39
40 import org.htmlparser.tags.LinkTag;
41 import org.htmlparser.tags.Tag;
42 import org.htmlparser.tags.data.CompositeTagData;
43 import org.htmlparser.tags.data.LinkData;
44 import org.htmlparser.tags.data.TagData;
45 import org.htmlparser.util.LinkProcessor;
46 import org.htmlparser.util.ParserException;
47 import org.htmlparser.util.ParserUtils;
48
49 /**
50 * Scans for the Link Tag. This is a subclass of TagScanner, and is called using a
51 * variant of the template method. If the evaluate() method returns true, that means the
52 * given string contains an image tag. Extraction is done by the scan method thereafter
53 * by the user of this class.
54 */
55 public class LinkScanner extends CompositeTagScanner
56 {
57 private static final String MATCH_NAME[] = { "A" };
58 public static final String LINK_SCANNER_ID = "A";
59 public static final String DIRTY_TAG_MESSAGE =
60 " is a dirty link tag - the tag was not closed. \nWe encountered an open tag, before the previous end tag was found.\nCorrecting this..";
61 private LinkProcessor processor;
62 private final static String ENDERS[] =
63 { "TD", "TR", "FORM", "LI", "BODY", "HTML" };
64 private final static String ENDTAG_ENDERS[] =
65 { "TD", "TR", "FORM", "LI", "BODY", "HTML" };
66
67 /**
68 * Overriding the default constructor
69 */
70 public LinkScanner()
71 {
72 this("");
73 }
74
75 /**
76 * Overriding the constructor to accept the filter
77 */
78 public LinkScanner(String filter)
79 {
80 super(filter, MATCH_NAME, ENDERS, ENDTAG_ENDERS, false);
81 processor = new LinkProcessor();
82 }
83
84 public Tag createTag(TagData tagData, CompositeTagData compositeTagData)
85 throws ParserException
86 {
87
88 String link =
89 extractLink(
90 compositeTagData.getStartTag(),
91 tagData.getUrlBeingParsed());
92 int mailto = link.indexOf("mailto");
93 boolean mailLink = false;
94 if (mailto == 0)
95 {
96 // yes it is
97 mailto = link.indexOf(":");
98 link = link.substring(mailto + 1);
99 mailLink = true;
100 }
101 int javascript = link.indexOf("javascript:");
102 boolean javascriptLink = false;
103 if (javascript == 0)
104 {
105 link = link.substring(11);
106 // this magic number is "javascript:".length()
107 javascriptLink = true;
108 }
109 String accessKey = getAccessKey(compositeTagData.getStartTag());
110 String myLinkText = compositeTagData.getChildren().toString();
111
112 LinkTag linkTag =
113 new LinkTag(
114 tagData,
115 compositeTagData,
116 new LinkData(
117 link,
118 myLinkText,
119 accessKey,
120 mailLink,
121 javascriptLink));
122 linkTag.setThisScanner(this);
123 return linkTag;
124 }
125
126 /**
127 * Template Method, used to decide if this scanner can handle the Link tag type. If
128 * the evaluation returns true, the calling side makes a call to scan().
129 * @param s The complete text contents of the Tag.
130 * @param previousOpenScanner Indicates any previous scanner which hasnt completed, before the current
131 * scan has begun, and hence allows us to write scanners that can work with dirty html
132 */
133 public boolean evaluate(String s, TagScanner previousOpenScanner)
134 {
135 char ch;
136 boolean ret;
137
138 // eat up leading blanks
139 s = absorbLeadingBlanks(s);
140 if (5 > s.length())
141 ret = false;
142 else
143 {
144 ch = s.charAt(0);
145 if ((ch == 'a' || ch == 'A')
146 && Character.isWhitespace(s.charAt(1)))
147 ret = -1 != s.toUpperCase().indexOf("HREF");
148 else
149 ret = false;
150 }
151
152 return (ret);
153 }
154
155 /**
156 * Extract the link from the given string. The URL of the actual html page is also
157 * provided.
158 */
159 public String extractLink(Tag tag, String url) throws ParserException
160 {
161 try
162 {
163 Hashtable table = tag.getAttributes();
164 String relativeLink = (String) table.get("HREF");
165 if (relativeLink != null)
166 {
167 relativeLink = ParserUtils.removeChars(relativeLink, '\n');
168 relativeLink = ParserUtils.removeChars(relativeLink, '\r');
169 }
170 return processor.extract(relativeLink, url);
171 }
172 catch (Exception e)
173 {
174 String msg;
175 if (tag != null)
176 msg = tag.getText();
177 else
178 msg = "null";
179 throw new ParserException(
180 "HTMLLinkScanner.extractLink() : Error while extracting link from tag "
181 + msg
182 + ", url = "
183 + url,
184 e);
185 }
186 }
187
188 /**
189 * Extract the access key from the given tag.
190 * @param text Text to be parsed to pick out the access key.
191 * @return The value of the ACCESSKEY attribute.
192 */
193 private String getAccessKey(Tag tag)
194 {
195 return tag.getAttribute("ACCESSKEY");
196 }
197
198 public BaseHrefScanner createBaseHREFScanner(String filter)
199 {
200 return new BaseHrefScanner(filter, processor);
201 }
202
203 public ImageScanner createImageScanner(String filter)
204 {
205 return new ImageScanner(filter, processor);
206 }
207
208 /**
209 * @see org.htmlparser.scanners.TagScanner#getID()
210 */
211 public String[] getID()
212 {
213 return MATCH_NAME;
214 }
215
216 }