Save This Page
Home » sitemesh-2.3 » com.opensymphony.module » sitemesh » mapper » [javadoc | source]
    1   /*
    2    * Title:        RobotDecoratorMapper
    3    * Description:
    4    *
    5    * This software is published under the terms of the OpenSymphony Software
    6    * License version 1.1, of which a copy has been included with this
    7    * distribution in the LICENSE.txt file.
    8    */
    9   
   10   package com.opensymphony.module.sitemesh.mapper;
   11   
   12   import com.opensymphony.module.sitemesh.Config;
   13   import com.opensymphony.module.sitemesh.Decorator;
   14   import com.opensymphony.module.sitemesh.DecoratorMapper;
   15   import com.opensymphony.module.sitemesh.Page;
   16   import com.opensymphony.module.sitemesh.RequestConstants;
   17   
   18   import javax.servlet.http.HttpServletRequest;
   19   import javax.servlet.http.HttpSession;
   20   import java.util.Properties;
   21   
   22   /**
   23    * The RobotDecoratorMapper will use the specified decorator when the requester
   24    * is identified as a robot (also known as spider, crawler, ferret) of a search engine.
   25    *
   26    * <p>The name of this decorator should be supplied in the <code>decorator</code>
   27    * property.</p>
   28    *
   29    * @author <a href="mailto:pathos@pandora.be">Mathias Bogaert</a>
   30    * @version $Revision: 1.2 $
   31    *
   32    * @see com.opensymphony.module.sitemesh.DecoratorMapper
   33    */
   34   public final class RobotDecoratorMapper extends AbstractDecoratorMapper {
   35       private String decoratorName = null;
   36   
   37       /** All known robot hosts (list can be found <a href="http://www.spiderhunter.com">here</a>). */
   38       private static final String[] botHosts = {"alltheweb.com", "alta-vista.net", "altavista.com",
   39                                                 "atext.com", "euroseek.net", "excite.com",
   40                                                 "fast-search.net", "google.com", "googlebot.com",
   41                                                 "infoseek.co.jp", "infoseek.com", "inktomi.com",
   42                                                 "inktomisearch.com", "linuxtoday.com.au", "lycos.com",
   43                                                 "lycos.com", "northernlight.com", "pa-x.dec.com"};
   44   
   45       /**
   46        * All known robot user-agent headers (list can be found
   47        * <a href="http://www.robotstxt.org/wc/active.html">here</a>).
   48        *
   49        * <p>NOTE: To avoid bad detection:</p>
   50        *
   51        * <ul>
   52        *  <li>Robots with ID of 2 letters only were removed</li>
   53        *  <li>Robot called "webs" were removed</li>
   54        *  <li>directhit was changed in direct_hit (its real id)</li>
   55        * </ul>
   56        */
   57       private static final String[] botAgents = {
   58           "acme.spider", "ahoythehomepagefinder", "alkaline", "appie", "arachnophilia",
   59           "architext", "aretha", "ariadne", "aspider", "atn.txt", "atomz", "auresys",
   60           "backrub", "bigbrother", "bjaaland", "blackwidow", "blindekuh", "bloodhound",
   61           "brightnet", "bspider", "cactvschemistryspider", "calif", "cassandra",
   62           "cgireader", "checkbot", "churl", "cmc", "collective", "combine", "conceptbot",
   63           "core", "cshkust", "cusco", "cyberspyder", "deweb", "dienstspider", "diibot",
   64           "direct_hit", "dnabot", "download_express", "dragonbot", "dwcp", "ebiness",
   65           "eit", "emacs", "emcspider", "esther", "evliyacelebi", "fdse", "felix",
   66           "ferret", "fetchrover", "fido", "finnish", "fireball", "fish", "fouineur",
   67           "francoroute", "freecrawl", "funnelweb", "gazz", "gcreep", "getbot", "geturl",
   68           "golem", "googlebot", "grapnel", "griffon", "gromit", "gulliver", "hambot",
   69           "harvest", "havindex", "hometown", "wired-digital", "htdig", "htmlgobble",
   70           "hyperdecontextualizer", "ibm", "iconoclast", "ilse", "imagelock", "incywincy",
   71           "informant", "infoseek", "infoseeksidewinder", "infospider", "inspectorwww",
   72           "intelliagent", "iron33", "israelisearch", "javabee", "jcrawler", "jeeves",
   73           "jobot", "joebot", "jubii", "jumpstation", "katipo", "kdd", "kilroy",
   74           "ko_yappo_robot", "labelgrabber.txt", "larbin", "legs", "linkscan",
   75           "linkwalker", "lockon", "logo_gif", "lycos", "macworm", "magpie", "mediafox",
   76           "merzscope", "meshexplorer", "mindcrawler", "moget", "momspider", "monster",
   77           "motor", "muscatferret", "mwdsearch", "myweb", "netcarta", "netmechanic",
   78           "netscoop", "newscan-online", "nhse", "nomad", "northstar", "nzexplorer",
   79           "occam", "octopus", "orb_search", "packrat", "pageboy", "parasite", "patric",
   80           "perignator", "perlcrawler", "phantom", "piltdownman", "pioneer", "pitkow",
   81           "pjspider", "pka", "plumtreewebaccessor", "poppi", "portalb", "puu", "python",
   82           "raven", "rbse", "resumerobot", "rhcs", "roadrunner", "robbie", "robi",
   83           "roverbot", "safetynetrobot", "scooter", "search_au", "searchprocess",
   84           "senrigan", "sgscout", "shaggy", "shaihulud", "sift", "simbot", "site-valet",
   85           "sitegrabber", "sitetech", "slurp", "smartspider", "snooper", "solbot",
   86           "spanner", "speedy", "spider_monkey", "spiderbot", "spiderman", "spry",
   87           "ssearcher", "suke", "sven", "tach_bw", "tarantula", "tarspider", "tcl",
   88           "techbot", "templeton", "titin", "titan", "tkwww", "tlspider", "ucsd",
   89           "udmsearch", "urlck", "valkyrie", "victoria", "visionsearch", "voyager",
   90           "vwbot", "w3index", "w3m2", "wanderer", "webbandit", "webcatcher", "webcopy",
   91           "webfetcher", "webfoot", "weblayers", "weblinker", "webmirror", "webmoose",
   92           "webquest", "webreader", "webreaper", "websnarf", "webspider", "webvac",
   93           "webwalk", "webwalker", "webwatch", "wget", "whowhere", "wmir", "wolp",
   94           "wombat", "worm", "wwwc", "wz101", "xget", "nederland.zoek"
   95       };
   96   
   97       public void init(Config config, Properties properties, DecoratorMapper parent) throws InstantiationException {
   98           super.init(config, properties, parent);
   99           decoratorName = properties.getProperty("decorator");
  100       }
  101   
  102       public Decorator getDecorator(HttpServletRequest request, Page page) {
  103           Decorator result = null;
  104   
  105           if (decoratorName != null && isBot(request)) {
  106               result = getNamedDecorator(request, decoratorName);
  107           }
  108   
  109           return result == null ? super.getDecorator(request, page) : result;
  110       }
  111   
  112       /** Check if the current request came from  a robot (also known as spider, crawler, ferret) */
  113       private static boolean isBot(HttpServletRequest request) {
  114           if (request == null) return false;
  115   
  116           // force creation of a session
  117           HttpSession session = request.getSession(true);
  118   
  119           if (Boolean.FALSE.equals(session.getAttribute(RequestConstants.ROBOT))) {
  120               return false;
  121           }
  122           else if (Boolean.TRUE.equals(session.getAttribute(RequestConstants.ROBOT))) {
  123               // a key was found in the session indicating it is a robot
  124               return true;
  125           }
  126           else {
  127               if ("robots.txt".indexOf(request.getRequestURI()) != -1) {
  128                   // there is a specific request for the robots.txt file, so we assume
  129                   // it must be a robot (only robots request robots.txt)
  130   
  131                   // set a key in the session, so the next time we don't have to manually
  132                   // detect the robot again
  133                   session.setAttribute(RequestConstants.ROBOT, Boolean.TRUE);
  134                   return true;
  135               }
  136               else {
  137                   String userAgent = request.getHeader("User-Agent");
  138   
  139                   if (userAgent != null && userAgent.trim().length() > 2) {
  140                       // first check for common user-agent headers, so that we can speed
  141                       // this thing up, hopefully clever spiders will not send a fake header
  142                       if (userAgent.indexOf("MSIE")      != -1 || userAgent.indexOf("Gecko")   != -1    // MSIE and Mozilla
  143                        || userAgent.indexOf("Opera")     != -1 || userAgent.indexOf("iCab")    != -1    // Opera and iCab (mac browser)
  144                        || userAgent.indexOf("Konqueror") != -1 || userAgent.indexOf("KMeleon") != -1    // Konqueror and KMeleon
  145                        || userAgent.indexOf("4.7")       != -1 || userAgent.indexOf("Lynx")    != -1) { // NS 4.78 and Lynx
  146                           // indicate this session is not a robot
  147                           session.setAttribute(RequestConstants.ROBOT, Boolean.FALSE);
  148                           return false;
  149                       }
  150   
  151                       for (int i = 0; i < botAgents.length; i++) {
  152                           if (userAgent.indexOf(botAgents[i]) != -1) {
  153                               // set a key in the session, so the next time we don't have to manually
  154                               // detect the robot again
  155                               session.setAttribute(RequestConstants.ROBOT, Boolean.TRUE);
  156                               return true;
  157                           }
  158                       }
  159                   }
  160   
  161                   // detect the robot from the host or user-agent
  162                   String remoteHost = request.getRemoteHost(); // requires one DNS lookup
  163   
  164                   // if the DNS server didn't return a hostname, getRemoteHost returns the
  165                   // IP address, which is ignored here (the last char is checked, because some
  166                   // remote hosts begin with the IP)
  167                   if (remoteHost != null && remoteHost.length() > 0 && remoteHost.charAt(remoteHost.length() - 1) > 64) {
  168                       for (int i = 0; i < botHosts.length; i++) {
  169                           if (remoteHost.indexOf(botHosts[i]) != -1) {
  170                               // set a key in the session, so the next time we don't have to manually
  171                               // detect the robot again
  172                               session.setAttribute(RequestConstants.ROBOT, Boolean.TRUE);
  173                               return true;
  174                           }
  175                       }
  176                   }
  177   
  178                   // remote host and user agent are not in the predefined list,
  179                   // so it must be an unknown robot or not a robot
  180   
  181                   // indicate this session is not a robot
  182                   session.setAttribute(RequestConstants.ROBOT, Boolean.FALSE);
  183                   return false;
  184               }
  185           }
  186       }
  187   }

Save This Page
Home » sitemesh-2.3 » com.opensymphony.module » sitemesh » mapper » [javadoc | source]