Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/htmlparser/scanners/LinkScanner.java


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/scanners/LinkScanner.java,v 1.2 2004/02/10 13:41:09 woolfel Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  
33  package org.htmlparser.scanners;
34  
35  //////////////////
36  // Java Imports //
37  //////////////////
38  import java.util.Hashtable;
39  
40  import org.htmlparser.tags.LinkTag;
41  import org.htmlparser.tags.Tag;
42  import org.htmlparser.tags.data.CompositeTagData;
43  import org.htmlparser.tags.data.LinkData;
44  import org.htmlparser.tags.data.TagData;
45  import org.htmlparser.util.LinkProcessor;
46  import org.htmlparser.util.ParserException;
47  import org.htmlparser.util.ParserUtils;
48  
49  /**
50   * Scans for the Link Tag. This is a subclass of TagScanner, and is called using a 
51   * variant of the template method. If the evaluate() method returns true, that means the
52   * given string contains an image tag. Extraction is done by the scan method thereafter
53   * by the user of this class.
54   */
55  public class LinkScanner extends CompositeTagScanner
56  {
57      private static final String MATCH_NAME[] = { "A" };
58      public static final String LINK_SCANNER_ID = "A";
59      public static final String DIRTY_TAG_MESSAGE =
60          " is a dirty link tag - the tag was not closed. \nWe encountered an open tag, before the previous end tag was found.\nCorrecting this..";
61      private LinkProcessor processor;
62      private final static String ENDERS[] =
63          { "TD", "TR", "FORM", "LI", "BODY", "HTML" };
64      private final static String ENDTAG_ENDERS[] =
65          { "TD", "TR", "FORM", "LI", "BODY", "HTML" };
66  
67      /**
68       * Overriding the default constructor
69       */
70      public LinkScanner()
71      {
72          this("");
73      }
74  
75      /**
76       * Overriding the constructor to accept the filter 
77       */
78      public LinkScanner(String filter)
79      {
80          super(filter, MATCH_NAME, ENDERS, ENDTAG_ENDERS, false);
81          processor = new LinkProcessor();
82      }
83  
84      public Tag createTag(TagData tagData, CompositeTagData compositeTagData)
85          throws ParserException
86      {
87  
88          String link =
89              extractLink(
90                  compositeTagData.getStartTag(),
91                  tagData.getUrlBeingParsed());
92          int mailto = link.indexOf("mailto");
93          boolean mailLink = false;
94          if (mailto == 0)
95          {
96              // yes it is
97              mailto = link.indexOf(":");
98              link = link.substring(mailto + 1);
99              mailLink = true;
100         }
101         int javascript = link.indexOf("javascript:");
102         boolean javascriptLink = false;
103         if (javascript == 0)
104         {
105             link = link.substring(11);
106             // this magic number is "javascript:".length()
107             javascriptLink = true;
108         }
109         String accessKey = getAccessKey(compositeTagData.getStartTag());
110         String myLinkText = compositeTagData.getChildren().toString();
111 
112         LinkTag linkTag =
113             new LinkTag(
114                 tagData,
115                 compositeTagData,
116                 new LinkData(
117                     link,
118                     myLinkText,
119                     accessKey,
120                     mailLink,
121                     javascriptLink));
122         linkTag.setThisScanner(this);
123         return linkTag;
124     }
125 
126     /**
127      * Template Method, used to decide if this scanner can handle the Link tag type. If 
128      * the evaluation returns true, the calling side makes a call to scan().
129      * @param s The complete text contents of the Tag.
130      * @param previousOpenScanner Indicates any previous scanner which hasnt completed, before the current
131      * scan has begun, and hence allows us to write scanners that can work with dirty html
132      */
133     public boolean evaluate(String s, TagScanner previousOpenScanner)
134     {
135         char ch;
136         boolean ret;
137 
138         // eat up leading blanks
139         s = absorbLeadingBlanks(s);
140         if (5 > s.length())
141             ret = false;
142         else
143         {
144             ch = s.charAt(0);
145             if ((ch == 'a' || ch == 'A')
146                 && Character.isWhitespace(s.charAt(1)))
147                 ret = -1 != s.toUpperCase().indexOf("HREF");
148             else
149                 ret = false;
150         }
151 
152         return (ret);
153     }
154 
155     /**
156      * Extract the link from the given string. The URL of the actual html page is also 
157      * provided.    
158      */
159     public String extractLink(Tag tag, String url) throws ParserException
160     {
161         try
162         {
163             Hashtable table = tag.getAttributes();
164             String relativeLink = (String) table.get("HREF");
165             if (relativeLink != null)
166             {
167                 relativeLink = ParserUtils.removeChars(relativeLink, '\n');
168                 relativeLink = ParserUtils.removeChars(relativeLink, '\r');
169             }
170             return processor.extract(relativeLink, url);
171         }
172         catch (Exception e)
173         {
174             String msg;
175             if (tag != null)
176                 msg = tag.getText();
177             else
178                 msg = "null";
179             throw new ParserException(
180                 "HTMLLinkScanner.extractLink() : Error while extracting link from tag "
181                     + msg
182                     + ", url = "
183                     + url,
184                 e);
185         }
186     }
187 
188     /**
189      * Extract the access key from the given tag.
190      * @param text Text to be parsed to pick out the access key.
191      * @return The value of the ACCESSKEY attribute.
192      */
193     private String getAccessKey(Tag tag)
194     {
195         return tag.getAttribute("ACCESSKEY");
196     }
197 
198     public BaseHrefScanner createBaseHREFScanner(String filter)
199     {
200         return new BaseHrefScanner(filter, processor);
201     }
202 
203     public ImageScanner createImageScanner(String filter)
204     {
205         return new ImageScanner(filter, processor);
206     }
207 
208     /**
209      * @see org.htmlparser.scanners.TagScanner#getID()
210      */
211     public String[] getID()
212     {
213         return MATCH_NAME;
214     }
215 
216 }