Source code: com/telefonicasoluciones/search/server/parser/html/HTMLParser.java
1 package com.telefonicasoluciones.search.server.parser.html;
2
3 import java.io.Reader;
4 import java.io.StringReader;
5 import java.util.*;
6 import com.telefonicasoluciones.search.server.parser.xml.*;
7
8 public class HTMLParser {
9 private ArrayList tokens;
10 private ArrayList content;
11 private ArrayList links;
12 private HashMap meta;
13 private String title;
14
15 public HTMLParser(String stringContent) throws HTMLParseException {
16 tokens = new ArrayList();
17 content = new ArrayList();
18 links = new ArrayList();
19 meta = new HashMap();
20 try {
21 parse(stringContent.toCharArray());
22 } catch(Exception e) {
23 throw new HTMLParseException(e.getMessage());
24 }
25 }
26 public StringBuffer getBufferContent() {
27 StringBuffer sb = new StringBuffer();
28 for(int i = 0; i < content.size(); i++)
29 sb.append(String.valueOf((String) content.get(i)).concat(" "));
30 return sb;
31 }
32 public Reader getContentReader() {
33 StringBuffer sb = new StringBuffer();
34 for(int i = 0; i < content.size(); i++)
35 sb.append(String.valueOf((String) content.get(i)).concat(" "));
36 Reader reader = new StringReader(sb.toString());
37 return reader;
38 }
39 public ArrayList getLinks() {
40 return links;
41 }
42 public String getTitle() {
43 return title;
44 }
45 public ArrayList getTokens() {
46 return tokens;
47 }
48 public HashMap getMeta() {
49 return meta;
50 }
51 private int indexOf(char c, char array[], int start) throws Exception {
52 for(int i = start; i < array.length; i++) {
53 if(array[i] == c) {
54 return i;
55 }
56 }
57 return -1;
58 }
59 private char[] substring(char array[], int start, int end) throws Exception {
60 char string[] = new char[(end-start)+1];
61 int j = 0;
62 for(int i = start; i < end && i < array.length; i++) {
63 string[j] = array[i];
64 j++;
65 }
66 return string;
67 }
68 private void parse(char[] data) throws HTMLParseException {
69 char separator = '>';
70 boolean isTitle = false;
71 boolean isCode = false;
72 String token = new String();
73 int offset = -1;
74 int index = -1;
75 try {
76 while(true) {
77 offset = index + 1;
78 index = indexOf(separator, data, offset);
79 if(index>0) {
80 if(index < data.length)
81 token = String.valueOf(substring(data, offset, index));
82 } else {
83 break;
84 }
85 token = token.trim().intern();
86 if(separator == '<') {
87 if(token.length() > 0) {
88 TextToken tt = new TextToken();
89 tt.setText(token);
90 if(isTitle) {
91 isTitle = false;
92 title = tt.getText();
93 }
94 if(!isCode) {
95 if(tt.getText().length() > 0) {
96 content.add(tt.getText());
97 }
98 tokens.add(tt.getText());
99 }
100 }
101 isCode = false;
102 } else {
103 TagToken tt = new TagToken(token);
104 if(tt.toString().toLowerCase().indexOf("href") > 0 && tt.toString().indexOf("!-") == -1)
105 if(tt.getAttribute("href")!=null) {
106 links.add(tt.getAttribute("href"));
107 }
108 if(tt.toString().toLowerCase().startsWith("<frame"))
109 if(tt.getAttribute("src")!=null) {
110 links.add(tt.getAttribute("src"));
111 }
112 if(tt.toString().toLowerCase().indexOf("meta") != -1 && tt.toString().indexOf("!-") == -1) {
113 if(tt.getAttribute("meta")!=null&&tt.getAttribute("value")!=null) {
114 meta.put(tt.getAttribute("name"),tt.getAttribute("value"));
115 }
116 }
117 if(tt.toString().toLowerCase().startsWith("<title"))
118 isTitle = true;
119 if(tt.toString().toLowerCase().startsWith("<script"))
120 isCode = true;
121 if(tt.toString().toLowerCase().startsWith("<style"))
122 isCode = true;
123 tokens.add("<"+tt.toString()+">");
124 }
125 if(separator == '<')
126 separator = '>';
127 else
128 separator = '<';
129 }
130 } catch(Exception e) {
131 throw new HTMLParseException(e.getMessage());
132 }
133 }
134 }