nothing...
[serc.git] / Fetcher.java
blob79c2dff17da5764fd04721c148cd22a4505e50aa
1 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Copyright 2008 Ledermueller Achim
4 * serc is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
18 import java.io.*;
19 import java.net.*;
20 import java.util.zip.GZIPInputStream;
/**
 * Downloads search result pages from a handful of scholarly search engines
 * and returns the received text.
 *
 * <p>The query string is adapted to each engine's URL conventions (separator
 * for whitespace, paging parameter) by {@link #transformQuery}. Engines that
 * page their results are requested repeatedly until the requested number of
 * results has been covered.
 *
 * <p>An instance keeps its paging state and the accumulated text in plain,
 * unsynchronized fields, so a single instance must not be used by several
 * threads at once.
 */
class Fetcher {

    /**
     * Base search URLs. The transformed query is appended directly to these,
     * so each one ends with the name of the engine's query parameter.
     */
    public static String[] searchEngines = {
        // Google Scholar
        "http://scholar.google.com/scholar?q=",
        // Scientific Commons (answers gzip-compressed, see getPage)
        "http://www.scientificcommons.org/opensearch?search_string=",
        // OAIster
        "http://quod.lib.umich.edu/cgi/b/bib/bib-idx?type=boolean&c=oaister&rgn1=entire+record&op2=and&rgn2=entire+record&sort=weighted+hit+frequency&submit2=search&sourceid=Mozilla-search&q1=",
        // Scirus
        "http://www.scirus.com/srsapp/search?q=",
        // E-LIS (eprints.rclis.org)
        "http://eprints.rclis.org/perl/search/simple?title_srchtype=ALL&_satisfyall=ALL&full_merge=ALL&_order=byyear&_action_search=Submit&full="
    };

    /** HTTP header lines must be terminated by CR LF, not by a bare LF. */
    private static final String CRLF = "\r\n";

    /** Port used when the URL does not name one. */
    private static final int DEFAULT_HTTP_PORT = 80;

    /** Number of characters read from the server per read call. */
    private static final int READ_CHUNK = 4096;

    /** Text received so far during the current getPage call. */
    private final StringBuilder output;

    /** Set by transformQuery when another result page has to be requested. */
    private boolean goOn;

    /** Paging position; its unit (page number or result offset) depends on the engine. */
    private int pageCursor;

    Fetcher() {
        goOn = true;
        pageCursor = 0;
        output = new StringBuilder();
    }

    /**
     * Returns the content of 'page' + 'query' as a String.
     *
     * <p>For engines that page their results the request is repeated with an
     * advancing paging parameter and all responses are concatenated.
     *
     * <p>Without {@code gzip} the request is sent over a raw socket and
     * everything the server sends is returned, i.e. the status line and the
     * response headers precede the body. With {@code gzip} the body is
     * fetched through {@link URL#openStream()} and decompressed, so only the
     * body text is returned.
     *
     * <p>Errors are not propagated: the exception is printed to
     * {@code System.err} and whatever was received up to that point is
     * returned (possibly the empty string).
     *
     * @param page   base URL, normally one of {@link #searchEngines}; must use http
     * @param query  search terms as typed by the user
     * @param number requested number of results; how it is applied depends on the engine
     * @param gzip   {@code true} if the server answers gzip-compressed
     *               (needed for scientificcommons only)
     * @return the text received from the server
     */
    public String getPage(String page, String query, int number, boolean gzip) {
        // Start from a clean state so that an instance can serve more than one
        // call; otherwise a second call would return the stale first result
        // without contacting the server.
        output.setLength(0);
        pageCursor = 0;
        goOn = true;

        try {
            while (goOn) {
                // transformQuery switches this back on if more pages are needed.
                goOn = false;

                URL url = new URL(page);
                if (!url.getProtocol().equals("http")) {
                    throw new IllegalArgumentException("Protocol must be 'http:'");
                }

                String filename = url.getFile();
                if (filename.equals("")) {
                    filename = "/";
                }
                filename += transformQuery(page, query, number);
                System.out.println("Filename: " + filename);

                if (gzip) {
                    // scientificcommons sends gzip even though gzip;q=0 is requested
                    fetchGzip(url, filename);
                } else {
                    fetchPlain(url, filename);
                }
            }
        } catch (Exception e) {
            // Deliberately best effort: report and return what has been read.
            System.err.println(e);
        }

        return output.toString();
    }

    /** Requests 'filename' over a raw socket and appends the complete raw response. */
    private void fetchPlain(URL url, String filename) throws IOException {
        String host = url.getHost();
        int port = url.getPort();
        if (port == -1) {
            port = DEFAULT_HTTP_PORT;
        }
        // The Host header has to carry the port unless it is the default one.
        String hostHeader = (port == DEFAULT_HTTP_PORT) ? host : host + ":" + port;

        Socket socket = new Socket(host, port);
        try {
            PrintWriter toServer = new PrintWriter(socket.getOutputStream());
            toServer.print(buildRequest(hostHeader, filename));
            toServer.flush();
            appendAll(socket.getInputStream());
        } finally {
            socket.close();
        }
    }

    /** Requests 'filename' (including the query) via URL.openStream and appends the gunzipped body. */
    private void fetchGzip(URL url, String filename) throws IOException {
        // Rebuild the URL with the transformed query; opening the bare base
        // URL would request the search page without any search terms.
        URL target = new URL(url.getProtocol(), url.getHost(), url.getPort(), filename);
        InputStream raw = target.openStream();
        try {
            appendAll(new GZIPInputStream(raw));
        } finally {
            raw.close();
        }
    }

    /** Builds the GET request head, terminated by the empty line that ends the headers. */
    private static String buildRequest(String hostHeader, String filename) {
        StringBuilder request = new StringBuilder();
        request.append("GET ").append(filename).append(" HTTP/1.1").append(CRLF);
        request.append("Host: ").append(hostHeader).append(CRLF);
        request.append("User-Agent: my").append(CRLF);
        request.append("Accept: text/plain,text/html").append(CRLF);
        request.append("Accept-Language: en-us,en;q=0.5").append(CRLF);
        request.append("Accept-Encoding: gzip;q=0,deflate;q=0,compress;q=0").append(CRLF);
        request.append("Accept-Charset: ISO-8859-15").append(CRLF);
        // The response is read until end of stream, so the server must close
        // the connection instead of keeping it alive.
        request.append("Connection: close").append(CRLF);
        request.append(CRLF);
        return request.toString();
    }

    /**
     * Reads 'in' to its end and appends the text to the output, decoding with
     * the platform default charset. Decoding through one reader keeps
     * multi-byte characters intact when they straddle a read boundary.
     */
    private void appendAll(InputStream in) throws IOException {
        Reader reader = new InputStreamReader(in);
        char[] chunk = new char[READ_CHUNK];
        int count;
        while ((count = reader.read(chunk)) != -1) {
            output.append(chunk, 0, count);
        }
    }

    /**
     * Returns the query in the form expected by the given search engine:
     * whitespace replaced by the engine's separator and the engine's result
     * count or paging parameter appended. For paging engines the paging
     * position is advanced and goOn is set while further pages are needed.
     * For an unknown engine the query is returned unchanged.
     */
    private String transformQuery(String searchEngine, String query, int number) {
        if (searchEngine.equals(Fetcher.searchEngines[0])) {
            // scholar.google.com
            query = query.replaceAll("\\s", "+");
            // max 100; for more use &num=100&start={0,100,200,...} and set goOn
            query += "&num=" + number;
        } else if (searchEngine.equals(Fetcher.searchEngines[1])) {
            // scientificcommons.org/opensearch: pages are numbered from 1,
            // 15 results each (number = 30 fetches 2 pages)
            query = query.replaceAll("\\s", "%20");
            query += "&start=" + (++pageCursor);
            if (pageCursor < (number / 15)) {
                goOn = true;
            }
        } else if (searchEngine.equals(Fetcher.searchEngines[2])) {
            // OAIster
            query = query.replaceAll("\\s", "+");
            query += "&size=" + number;
        } else if (searchEngine.equals(Fetcher.searchEngines[3])) {
            // scirus.com: result offset in steps of 10
            query = query.replaceAll("\\s", "+");
            query += "&p=" + pageCursor;
            pageCursor += 10;
            if (pageCursor < number) {
                goOn = true;
            }
        } else if (searchEngine.equals(Fetcher.searchEngines[4])) {
            // eprints.rclis.org: result offset in steps of 40
            query = query.replaceAll("\\s", "+");
            query += "&_offset=" + pageCursor;
            pageCursor += 40;
            if (pageCursor < number) {
                goOn = true;
            }
        }
        return query;
    }
}