Parser.java

   1 /** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
   2  * Copyright 2008 Ledermueller Achim
   3  *
   4  * serc is free software: you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation, either version 3 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
  17
  18 import java.util.regex.Pattern;
  19 import java.util.regex.Matcher;
  20 import java.util.Vector;
  21 import java.net.URL;
  22
  23 class Parser {
  24
  25         private Pattern pattern;
  26         private ResultSet rSet = new ResultSet();
  27         private Result result = new Result();
  28
  29         public ResultSet extractData(String searchEngine, String page) {
  30                 if(searchEngine.equals(Fetcher.searchEngines[0]) || searchEngine.equals(Fetcher.searchEngines[2])) {
  31                         pattern  = Pattern.compile("\"\\s*(http|ftp)s?://.*?\"");
  32                 } else if(searchEngine.equals(Fetcher.searchEngines[1])) {
  33                         pattern  = Pattern.compile("<link>.*?</link>");
  34                 } else if(searchEngine.equals(Fetcher.searchEngines[3])) {
  35                         page = page.replaceAll("%3A",":");
  36                         page = page.replaceAll("%3B",";");
  37                         page = page.replaceAll("%3D","=");
  38                         page = page.replaceAll("%3F","?");
  39                         page = page.replaceAll("%23","#");
  40                         page = page.replaceAll("%22","\"");
  41                         page = page.replaceAll("%25","%");
  42                         page = page.replaceAll("%27","'");
  43                         page = page.replaceAll("%2B","+");
  44                         page = page.replaceAll("%26","&");
  45                         page = page.replaceAll("%2F","/");
  46                         page = page.replaceAll("%7E","~");
  47                         pattern  = Pattern.compile("\\?src=web&url=(http|ftp)s?://.*?\"");
  48                 } else if(searchEngine.equals(Fetcher.searchEngines[4])) {
  49                         pattern  = Pattern.compile("\"\\s*http://eprints\\.rclis\\.org/archive/.*?\"");
  50                 } else if(searchEngine.equals("sc")) {
  51                         pattern  = Pattern.compile("\"\\s*(http)s?://.*?\"");
  52                 } else if(searchEngine.equals("elis")) {
  53                         //there is also a alternative location - later
  54                         pattern  = Pattern.compile("\"\\s*http://eprints\\.rclis\\.org/archive/.*?\\.pdf\"");
  55                 } else {
  56                         pattern  = Pattern.compile("<link>.*?</link>");
  57                 }
  58
  59                 Matcher matcher = pattern.matcher(page);
  60
  61                 try{
  62                   //first if-else-clause the while would be better
  63                         while(matcher.find()) {
  64                                 if(searchEngine.equals(Fetcher.searchEngines[0])) {
  65
  66                       if(
  67                               (matcher.group().indexOf("scholar?") == -1 &&
  68                                matcher.group().indexOf("http://google.com/search?") == -1 &&
  69                                matcher.group().indexOf("http://www.google.com/search?") == -1 &&
  70                                matcher.group().indexOf("http://images.google.com/images?") == -1 &&
  71                                matcher.group().indexOf("http://video.google.com/videosearch?") == -1 &&
  72                                matcher.group().indexOf("http://news.google.com/news?") == -1 &&
  73                                matcher.group().indexOf("http://maps.google.com/maps?") == -1 &&
  74                                matcher.group().indexOf("http://google.com/search?") == -1 &&
  75                                matcher.group().indexOf("http://www.google.com/webhp?") == -1 &&
  76                                matcher.group().indexOf("http://www.google.com/webhp?") == -1 &&
  77                                matcher.group().indexOf("worldcat.org") == -1) &&
  78                                !matcher.group().equals("http://www.google.com/intl/en/about.html")
  79                         )
  80                               {
  81                                       rSet.addResult(new Result(new URL(matcher.group().replace('"', ' ').trim())));
  82                               }
  83                         } else if(searchEngine.equals(Fetcher.searchEngines[1])){
  84
  85                                 String str = matcher.group().replaceAll("<link>","");
  86                                 str = str.replaceAll("</link>","");
  87                                 if(str.indexOf("opensearch") == -1) {
  88                                         Fetcher f = new Fetcher();
  89                                         Parser p = new Parser();
  90                                         str = f.getPage(str, "", 0, true);
  91                                         rSet.addAll(p.extractData("sc", str));
  92
  93                                 }
  94                         } else if(searchEngine.equals(Fetcher.searchEngines[2])) {
  95
  96                                 if(
  97                                                 matcher.group().indexOf("quod.lib.umich.edu") == -1 &&
  98                                                 matcher.group().indexOf("google-analytics.com") == -1 &&
  99                                                 matcher.group().indexOf("www.oaister.org") == -1 &&
 100                                                 matcher.group().indexOf("openarchives.org") == -1 &&
 101                                                 matcher.group().indexOf("www.lib.umich.edu/opensearch/oaister.xml") == -1 &&
 102                                                 matcher.group().equals("\"http://www.umdl.umich.edu/\"") == false &&
 103                                                 matcher.group().equals("\"http://www.umich.edu/\"") == false
 104                                   )
 105                                 {
 106                                         rSet.addResult(new Result(new URL(matcher.group().replace('"', ' ').trim())));
 107                                 }
 108                         } else if(searchEngine.equals(Fetcher.searchEngines[3])) {
 109                                 if(matcher.group().indexOf("mail.elsevier-alerts.com") == -1) {
 110                                         rSet.addResult(new Result(new URL(matcher.group().replace('"', ' ').trim().replaceAll("\\?src=web&url=", ""))));
 111                                 }
 112                         } else if(searchEngine.equals(Fetcher.searchEngines[4])) {
 113
 114                                 String str = matcher.group().replace('"', ' ').trim();
 115                                 Fetcher f = new Fetcher();
 116                                 Parser p = new Parser();
 117                                 str = f.getPage(str, "", 0, false);
 118                                 rSet.addAll(p.extractData("elis", str));
 119
 120                         } else if(searchEngine.equals("sc")) {
 121                                 if(
 122                                                 matcher.group().indexOf("scientificcommons.org") == -1 &&
 123                                                 matcher.group().indexOf("www.w3.org") == -1 &&
 124                                                 matcher.group().indexOf("google") == -1
 125                                   )
 126                                 {
 127
 128                                                 result.addUrl(new URL(matcher.group().replace('"', ' ').trim()));
 129                                         }
 130
 131                                 } else if(searchEngine.equals("elis")) {
 132                                                 result.addUrl(new URL(matcher.group().replace('"', ' ').trim()));
 133
 134                                 }
 135                         }
 136                         //if a result with more than one url was needed (scientificcommons)
 137                         if(!result.isEmpty())
 138                                 rSet.addResult(result);
 139
 140                 }catch(Exception e) {
 141                         System.err.println(e);
 142                 }
 143
 144                 return rSet;
 145         }
 146 }