Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/telefonicasoluciones/search/server/parser/html/HTMLParser.java


1   package com.telefonicasoluciones.search.server.parser.html;
2   
3   import java.io.Reader;
4   import java.io.StringReader;
5   import java.util.*;
6   import com.telefonicasoluciones.search.server.parser.xml.*;
7   
8   public class HTMLParser {
9       private ArrayList tokens;
10      private ArrayList content;
11      private ArrayList links;
12      private HashMap meta;
13      private String title;
14      
15      public HTMLParser(String stringContent) throws HTMLParseException {
16    tokens = new ArrayList();
17    content = new ArrayList();
18    links = new ArrayList();
19          meta = new HashMap();
20    try {
21              parse(stringContent.toCharArray());
22    } catch(Exception e) {
23              throw new HTMLParseException(e.getMessage());
24    }
25      }
26      public StringBuffer getBufferContent() {
27    StringBuffer sb = new StringBuffer();
28    for(int i = 0; i < content.size(); i++)
29              sb.append(String.valueOf((String) content.get(i)).concat(" "));
30    return sb;
31      }
32      public Reader getContentReader() {
33    StringBuffer sb = new StringBuffer();
34    for(int i = 0; i < content.size(); i++)
35              sb.append(String.valueOf((String) content.get(i)).concat(" "));
36    Reader reader = new StringReader(sb.toString());
37    return reader;
38      }
39      public ArrayList getLinks() {
40    return links;
41      }
42      public String getTitle() {
43    return title;
44      }
45      public ArrayList getTokens() {
46    return tokens;
47      }
48      public HashMap getMeta() {
49    return meta;
50      }
51      private int indexOf(char c, char array[], int start) throws Exception {
52    for(int i = start; i < array.length; i++) {
53              if(array[i] == c) {
54                  return i;
55              }
56    }
57    return -1;
58      }
59      private char[] substring(char array[], int start, int end) throws Exception {
60    char string[] = new char[(end-start)+1];
61          int j = 0;
62          for(int i = start; i < end && i < array.length; i++) {
63              string[j] = array[i];
64              j++;
65    }
66    return string;
67      }
68      private void parse(char[] data) throws HTMLParseException {
69    char separator = '>';
70    boolean isTitle = false;
71    boolean isCode = false;
72    String token = new String();
73    int offset = -1;
74    int index = -1;
75    try {
76              while(true) {
77                  offset = index + 1;
78                  index = indexOf(separator, data, offset);
79                  if(index>0) {
80                      if(index < data.length)
81                          token = String.valueOf(substring(data, offset, index));
82                  } else {
83                      break;
84                  }
85                  token = token.trim().intern();
86                  if(separator == '<') {
87                      if(token.length() > 0) {
88                          TextToken tt = new TextToken();
89                          tt.setText(token);
90                          if(isTitle) {
91                              isTitle = false;
92                              title = tt.getText();
93                          }
94                          if(!isCode) {
95                              if(tt.getText().length() > 0) {
96                                  content.add(tt.getText());
97                              }
98                              tokens.add(tt.getText());
99                          }
100                     }
101                     isCode = false;
102                 } else {
103                     TagToken tt = new TagToken(token);
104                     if(tt.toString().toLowerCase().indexOf("href") > 0 && tt.toString().indexOf("!-") == -1)
105                         if(tt.getAttribute("href")!=null) {
106                             links.add(tt.getAttribute("href"));
107                         }
108                     if(tt.toString().toLowerCase().startsWith("<frame"))
109                         if(tt.getAttribute("src")!=null) {
110                             links.add(tt.getAttribute("src"));
111                         }
112                     if(tt.toString().toLowerCase().indexOf("meta") != -1 && tt.toString().indexOf("!-") == -1) {
113                         if(tt.getAttribute("meta")!=null&&tt.getAttribute("value")!=null) {
114                             meta.put(tt.getAttribute("name"),tt.getAttribute("value"));
115                         }
116                     } 
117                     if(tt.toString().toLowerCase().startsWith("<title"))
118                         isTitle = true;
119                     if(tt.toString().toLowerCase().startsWith("<script"))
120                         isCode = true;
121                     if(tt.toString().toLowerCase().startsWith("<style"))
122                         isCode = true;
123                     tokens.add("<"+tt.toString()+">");
124                 }
125                 if(separator == '<')
126                     separator = '>';
127                 else
128                     separator = '<';
129             }
130   } catch(Exception e) {
131             throw new HTMLParseException(e.getMessage());
132   }
133     }
134 }