nothing...
[serc.git] / Parser.java
blobe29e6c6e5f6f6c0c781f1a373bd8228ba12b30c4
/** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright 2008 Ledermueller Achim
 *
 * serc is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
18 import java.util.regex.Pattern;
19 import java.util.regex.Matcher;
20 import java.util.Vector;
21 import java.net.URL;
23 class Parser {
25 private Pattern pattern;
26 private ResultSet rSet = new ResultSet();
27 private Result result = new Result();
29 public ResultSet extractData(String searchEngine, String page) {
30 if(searchEngine.equals(Fetcher.searchEngines[0]) || searchEngine.equals(Fetcher.searchEngines[2])) {
31 pattern = Pattern.compile("\"\\s*(http|ftp)s?://.*?\"");
32 } else if(searchEngine.equals(Fetcher.searchEngines[1])) {
33 pattern = Pattern.compile("<link>.*?</link>");
34 } else if(searchEngine.equals(Fetcher.searchEngines[3])) {
35 page = page.replaceAll("%3A",":");
36 page = page.replaceAll("%3B",";");
37 page = page.replaceAll("%3D","=");
38 page = page.replaceAll("%3F","?");
39 page = page.replaceAll("%23","#");
40 page = page.replaceAll("%22","\"");
41 page = page.replaceAll("%25","%");
42 page = page.replaceAll("%27","'");
43 page = page.replaceAll("%2B","+");
44 page = page.replaceAll("%26","&");
45 page = page.replaceAll("%2F","/");
46 page = page.replaceAll("%7E","~");
47 pattern = Pattern.compile("\\?src=web&url=(http|ftp)s?://.*?\"");
48 } else if(searchEngine.equals(Fetcher.searchEngines[4])) {
49 pattern = Pattern.compile("\"\\s*http://eprints\\.rclis\\.org/archive/.*?\"");
50 } else if(searchEngine.equals("sc")) {
51 pattern = Pattern.compile("\"\\s*(http)s?://.*?\"");
52 } else if(searchEngine.equals("elis")) {
53 //there is also a alternative location - later
54 pattern = Pattern.compile("\"\\s*http://eprints\\.rclis\\.org/archive/.*?\\.pdf\"");
55 } else {
56 pattern = Pattern.compile("<link>.*?</link>");
59 Matcher matcher = pattern.matcher(page);
61 try{
62 //first if-else-clause the while would be better
63 while(matcher.find()) {
64 if(searchEngine.equals(Fetcher.searchEngines[0])) {
66 if(
67 (matcher.group().indexOf("scholar?") == -1 &&
68 matcher.group().indexOf("http://google.com/search?") == -1 &&
69 matcher.group().indexOf("http://www.google.com/search?") == -1 &&
70 matcher.group().indexOf("http://images.google.com/images?") == -1 &&
71 matcher.group().indexOf("http://video.google.com/videosearch?") == -1 &&
72 matcher.group().indexOf("http://news.google.com/news?") == -1 &&
73 matcher.group().indexOf("http://maps.google.com/maps?") == -1 &&
74 matcher.group().indexOf("http://google.com/search?") == -1 &&
75 matcher.group().indexOf("http://www.google.com/webhp?") == -1 &&
76 matcher.group().indexOf("http://www.google.com/webhp?") == -1 &&
77 matcher.group().indexOf("worldcat.org") == -1) &&
78 !matcher.group().equals("http://www.google.com/intl/en/about.html")
81 rSet.addResult(new Result(new URL(matcher.group().replace('"', ' ').trim())));
83 } else if(searchEngine.equals(Fetcher.searchEngines[1])){
85 String str = matcher.group().replaceAll("<link>","");
86 str = str.replaceAll("</link>","");
87 if(str.indexOf("opensearch") == -1) {
88 Fetcher f = new Fetcher();
89 Parser p = new Parser();
90 str = f.getPage(str, "", 0, true);
91 rSet.addAll(p.extractData("sc", str));
94 } else if(searchEngine.equals(Fetcher.searchEngines[2])) {
96 if(
97 matcher.group().indexOf("quod.lib.umich.edu") == -1 &&
98 matcher.group().indexOf("google-analytics.com") == -1 &&
99 matcher.group().indexOf("www.oaister.org") == -1 &&
100 matcher.group().indexOf("openarchives.org") == -1 &&
101 matcher.group().indexOf("www.lib.umich.edu/opensearch/oaister.xml") == -1 &&
102 matcher.group().equals("\"http://www.umdl.umich.edu/\"") == false &&
103 matcher.group().equals("\"http://www.umich.edu/\"") == false
106 rSet.addResult(new Result(new URL(matcher.group().replace('"', ' ').trim())));
108 } else if(searchEngine.equals(Fetcher.searchEngines[3])) {
109 if(matcher.group().indexOf("mail.elsevier-alerts.com") == -1) {
110 rSet.addResult(new Result(new URL(matcher.group().replace('"', ' ').trim().replaceAll("\\?src=web&url=", ""))));
112 } else if(searchEngine.equals(Fetcher.searchEngines[4])) {
114 String str = matcher.group().replace('"', ' ').trim();
115 Fetcher f = new Fetcher();
116 Parser p = new Parser();
117 str = f.getPage(str, "", 0, false);
118 rSet.addAll(p.extractData("elis", str));
120 } else if(searchEngine.equals("sc")) {
122 matcher.group().indexOf("scientificcommons.org") == -1 &&
123 matcher.group().indexOf("www.w3.org") == -1 &&
124 matcher.group().indexOf("google") == -1
128 result.addUrl(new URL(matcher.group().replace('"', ' ').trim()));
131 } else if(searchEngine.equals("elis")) {
132 result.addUrl(new URL(matcher.group().replace('"', ' ').trim()));
136 //if a result with more than one url was needed (scientificcommons)
137 if(!result.isEmpty())
138 rSet.addResult(result);
140 }catch(Exception e) {
141 System.err.println(e);
144 return rSet;