1 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Copyright 2008 Ledermueller Achim
4 * serc is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
18 import java
.util
.regex
.Pattern
;
19 import java
.util
.regex
.Matcher
;
20 import java
.util
.Vector
;
// Regular expression used to scan a fetched page; extractData assigns it
// according to the search engine it is asked to handle.
25 private Pattern pattern
;
// Collects the Result objects that extractData adds while iterating over the matches.
26 private ResultSet rSet
= new ResultSet();
// Single Result that the "sc" and "elis" branches of extractData append URLs to;
// extractData adds it to rSet when it is not empty.
27 private Result result
= new Result();
29 public ResultSet
extractData(String searchEngine
, String page
) {
30 if(searchEngine
.equals(Fetcher
.searchEngines
[0]) || searchEngine
.equals(Fetcher
.searchEngines
[2])) {
31 pattern
= Pattern
.compile("\"\\s*(http|ftp)s?://.*?\"");
32 } else if(searchEngine
.equals(Fetcher
.searchEngines
[1])) {
33 pattern
= Pattern
.compile("<link>.*?</link>");
34 } else if(searchEngine
.equals(Fetcher
.searchEngines
[3])) {
35 page
= page
.replaceAll("%3A",":");
36 page
= page
.replaceAll("%3B",";");
37 page
= page
.replaceAll("%3D","=");
38 page
= page
.replaceAll("%3F","?");
39 page
= page
.replaceAll("%23","#");
40 page
= page
.replaceAll("%22","\"");
41 page
= page
.replaceAll("%25","%");
42 page
= page
.replaceAll("%27","'");
43 page
= page
.replaceAll("%2B","+");
44 page
= page
.replaceAll("%26","&");
45 page
= page
.replaceAll("%2F","/");
46 page
= page
.replaceAll("%7E","~");
47 pattern
= Pattern
.compile("\\?src=web&url=(http|ftp)s?://.*?\"");
48 } else if(searchEngine
.equals(Fetcher
.searchEngines
[4])) {
49 pattern
= Pattern
.compile("\"\\s*http://eprints\\.rclis\\.org/archive/.*?\"");
50 } else if(searchEngine
.equals("sc")) {
51 pattern
= Pattern
.compile("\"\\s*(http)s?://.*?\"");
52 } else if(searchEngine
.equals("elis")) {
53 //there is also an alternative location - later
54 pattern
= Pattern
.compile("\"\\s*http://eprints\\.rclis\\.org/archive/.*?\\.pdf\"");
56 pattern
= Pattern
.compile("<link>.*?</link>");
59 Matcher matcher
= pattern
.matcher(page
);
62 //it would be better to evaluate the if-else clause first and run the while loop inside each branch
63 while(matcher
.find()) {
64 if(searchEngine
.equals(Fetcher
.searchEngines
[0])) {
67 (matcher
.group().indexOf("scholar?") == -1 &&
68 matcher
.group().indexOf("http://google.com/search?") == -1 &&
69 matcher
.group().indexOf("http://www.google.com/search?") == -1 &&
70 matcher
.group().indexOf("http://images.google.com/images?") == -1 &&
71 matcher
.group().indexOf("http://video.google.com/videosearch?") == -1 &&
72 matcher
.group().indexOf("http://news.google.com/news?") == -1 &&
73 matcher
.group().indexOf("http://maps.google.com/maps?") == -1 &&
74 matcher
.group().indexOf("http://google.com/search?") == -1 &&
75 matcher
.group().indexOf("http://www.google.com/webhp?") == -1 &&
76 matcher
.group().indexOf("http://www.google.com/webhp?") == -1 &&
77 matcher
.group().indexOf("worldcat.org") == -1) &&
78 !matcher
.group().equals("http://www.google.com/intl/en/about.html")
81 rSet
.addResult(new Result(new URL(matcher
.group().replace('"', ' ').trim())));
83 } else if(searchEngine
.equals(Fetcher
.searchEngines
[1])){
85 String str
= matcher
.group().replaceAll("<link>","");
86 str
= str
.replaceAll("</link>","");
87 if(str
.indexOf("opensearch") == -1) {
88 Fetcher f
= new Fetcher();
89 Parser p
= new Parser();
90 str
= f
.getPage(str
, "", 0, true);
91 rSet
.addAll(p
.extractData("sc", str
));
94 } else if(searchEngine
.equals(Fetcher
.searchEngines
[2])) {
97 matcher
.group().indexOf("quod.lib.umich.edu") == -1 &&
98 matcher
.group().indexOf("google-analytics.com") == -1 &&
99 matcher
.group().indexOf("www.oaister.org") == -1 &&
100 matcher
.group().indexOf("openarchives.org") == -1 &&
101 matcher
.group().indexOf("www.lib.umich.edu/opensearch/oaister.xml") == -1 &&
102 matcher
.group().equals("\"http://www.umdl.umich.edu/\"") == false &&
103 matcher
.group().equals("\"http://www.umich.edu/\"") == false
106 rSet
.addResult(new Result(new URL(matcher
.group().replace('"', ' ').trim())));
108 } else if(searchEngine
.equals(Fetcher
.searchEngines
[3])) {
109 if(matcher
.group().indexOf("mail.elsevier-alerts.com") == -1) {
110 rSet
.addResult(new Result(new URL(matcher
.group().replace('"', ' ').trim().replaceAll("\\?src=web&url=", ""))));
112 } else if(searchEngine
.equals(Fetcher
.searchEngines
[4])) {
114 String str
= matcher
.group().replace('"', ' ').trim();
115 Fetcher f
= new Fetcher();
116 Parser p
= new Parser();
117 str
= f
.getPage(str
, "", 0, false);
118 rSet
.addAll(p
.extractData("elis", str
));
120 } else if(searchEngine
.equals("sc")) {
122 matcher
.group().indexOf("scientificcommons.org") == -1 &&
123 matcher
.group().indexOf("www.w3.org") == -1 &&
124 matcher
.group().indexOf("google") == -1
128 result
.addUrl(new URL(matcher
.group().replace('"', ' ').trim()));
131 } else if(searchEngine
.equals("elis")) {
132 result
.addUrl(new URL(matcher
.group().replace('"', ' ').trim()));
136 //if a result with more than one url was needed (scientificcommons)
137 if(!result
.isEmpty())
138 rSet
.addResult(result
);
140 }catch(Exception e
) {
141 System
.err
.println(e
);