Source code: org/embl/ebi/escience/scuflworkers/java/ExtractImageLinks.java
1 /**
2 * This file is a component of the Taverna project,
3 * and is licensed under the GNU LGPL.
4 * Copyright Tom Oinn, EMBL-EBI
5 */
6 package org.embl.ebi.escience.scuflworkers.java;
7
8 import uk.ac.soton.itinnovation.taverna.enactor.entities.TaskExecutionException;
9 import java.util.*;
10 import java.io.*;
11 import java.net.*;
12 import org.embl.ebi.escience.baclava.*;
13
14 /**
15 * Extract a list of all image links in the supplied html document
16 * @author Tom Oinn
17 */
18 public class ExtractImageLinks implements LocalWorker {
19
20 private static final String NEWLINE = System.getProperty("line.separator");
21
22 public String[] inputNames() {
23 return new String[]{"document"};
24 }
25 public String[] inputTypes() {
26 return new String[]{"'text/html'"};
27 }
28 public String[] outputNames() {
29 return new String[]{"imagelinks"};
30 }
31 public String[] outputTypes() {
32 return new String[]{"l('text/x-taverna-web-url')"};
33 }
34
35 /**
36 * Fetch the web page pointed to by the URL supplied as the 'url'
37 * parameter into the service, the 'base' parameter specifies a
38 * URL to use as the base for relative URL resolution.
39 */
40 public Map execute(Map inputs) throws TaskExecutionException {
41 String content = (String)((DataThing)(inputs.get("document"))).getDataObject();
42 String lowerCaseContent = content.toLowerCase();
43 int index = 0;
44 List urlList = new ArrayList();
45 while ((index = lowerCaseContent.indexOf("<img", index)) != -1) {
46 if ((index = lowerCaseContent.indexOf("src", index)) == -1)
47 break;
48 if ((index = lowerCaseContent.indexOf("=", index)) == -1)
49 break;
50 index++;
51 String remaining = content.substring(index);
52 StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
53 String strLink = st.nextToken();
54 urlList.add(strLink);
55 }
56 Map outputs = new HashMap();
57 outputs.put("imagelinks",new DataThing(urlList));
58 return outputs;
59 }
60
61
62 }