adding all of botlist, initial add
[botlist.git] / botlistprojects / botbert / scala / botlist_scanfeeds.scala
blob98f871f54e6268120f41646d29b360779e9a0144
1 //
2 // 4/20/2007
3 // botlist_scanfeeds.scala
4 // Read RSS feeds based on list stored in database
5 //
6 // Stage One:
7 // * Read the property file with settings (reader_properties.xml)
8 // Stage Two:
9 // * Scan the system_scan_feeds table for a list of feeds
10 // Stage Three:
11 // * Store all new data into system_feed_items, set process_count to zero
12 // Stage Four:
13 // * Check the entity_links table for existing content
14 // * Process a set number of items from feed_items and increase the process_count
15 // Stage Five:
16 // * Store in entity_links
18 import scala.Console
19 import java.io._
20 import scala.xml._
21 import java.net._
22 import org.jdom._
23 import org.jdom.{Document => JDocument}
24 import java.util.{Iterator => JIterator}
25 import org.jdom.input._
26 import scala.dbc._
27 import scala.dbc.Syntax._
28 import scala.dbc.statement._
29 import scala.dbc.statement.expression._
30 import scala.dbc.value._
31 import scala.dbc.syntax._
32 import syntax.Statement._
33 import scala.dbc.statement.{Select => DBCSelect}
34 import java.sql.{DriverManager, Connection, ResultSet, PreparedStatement, Statement, Date }
35 import DriverManager.{getConnection => connect}
37 import RichSQL._
38 import KeywordProcessor._
40 import java.util.Random
42 //------------------------------------------------
43 // Bean Definitions for Object Mapping
44 //------------------------------------------------
/**
 * Row bean for one entry of the system_scan_feeds table (a feed to be scanned).
 *
 * The snake_case constructor fields follow the selected column names; each
 * camelCase member below is an alias of the corresponding field.
 */
case class ScanFeed(main_url: String, url_title: String, url_description: String, url_source: String) {
  val mainUrl: String = main_url
  val urlTitle: String = url_title
  val urlDescription: String = url_description
  val urlSource: String = url_source
}
/**
 * Row bean for one entry of the system_feed_items table (a single item
 * harvested from a feed).
 *
 * Each camelCase member is an alias of the matching snake_case constructor field.
 */
case class FeedItem(main_url: String, url_title: String, url_description: String, url_source: String) {
  val mainUrl: String = main_url
  val urlTitle: String = url_title
  val urlDescription: String = url_description
  val urlSource: String = url_source
}
/**
 * Bean describing a link by URL, title and description.
 *
 * Each camelCase member is an alias of the matching snake_case constructor field.
 */
case class EntityLink(main_url: String, url_title: String, url_description: String) {
  val mainUrl: String = main_url
  val urlTitle: String = url_title
  val urlDescription: String = url_description
}
/**
 * Single-column bean holding the result of a `select count(...)` query.
 * `count` is an alias of the `total` constructor field.
 */
case class CountFeedItems(total: Int) {
  val count: Int = total
}
70 //------------------------------------------------
71 // End of Bean Definitions
72 //------------------------------------------------
74 /**
75 * BotList Scan Feeds
77 object BotListScanFeeds {
// Account name stored in the full_name column of every entity link this bot creates.
val full_name_user: String = "botbert99"

// Value installed as the "http.agent" system property before feeds are fetched.
val useragent: String = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1)"

// Proxy settings; installed as the proxySet / proxyHost / proxyPort system
// properties and also written into the default reader_properties.xml.
val enable_proxy: String = "false"
val proxy_host: String = "the_proxy"
val proxy_port: String = "9999"

// Remaining defaults for the generated reader_properties.xml. All are kept as
// strings because they are emitted as XML attribute values.
val update_delay: String = "20"
val debug_enabled: String = "true"
val append_file: String = "false"
val read_description: String = "false"
val split_url_line: String = "true"
val news_feed: String = "http://reddit.com/new.rss"
val use_feed_file: String = "true"
val feed_file: String = "newsfeeds.dat"

// Limit on the number of entity links created by one createEntityLinks run.
val maxFeedItems: Int = 32

// Source of the random rating assigned to each new entity link.
val rand: Random = new Random(System.currentTimeMillis())

// Loading the class registers the MySQL JDBC driver before the connection is opened.
val nativeDriverClass = Class.forName("com.mysql.jdbc.Driver")
// Single shared connection used by every stage; closed at the end of main.
// NOTE(review): URL and credentials are hard-coded here — confirm this is intended.
val conn: Connection = connect("jdbc:mysql://localhost/botlist_development", "user", "PASSWORD")
// Default reader configuration, written to reader_properties.xml by main when
// that file does not exist yet. Every value comes from the settings above;
// split_url_line has no corresponding property in this document.
val valDefaultPropertyXML =
  <ReaderConfiguration>
    <settings>
      <property name="enable.proxy" value={enable_proxy} />
      <property name="proxy.host" value={proxy_host} />
      <property name="proxy.port" value={proxy_port} />
      <property name="update.delay" value={update_delay} />
      <property name="debug.enabled" value={debug_enabled} />
      <property name="append.file" value={append_file} />
      <property name="read.description" value={read_description} />
      <property name="news.feed" value={news_feed} />
      <property name="use.feed.file" value={use_feed_file} />
      <property name="feed.file" value={feed_file} />
    </settings>
  </ReaderConfiguration>
/**
 * Downloads the RSS document at `urlString` and extracts one
 * (title, description, link) triple for every `item` element found directly
 * under the root's `channel` element.
 *
 * Items are prepended while iterating, so the result is in reverse document
 * order. An item that lacks a `title`, `description` or `link` child gets an
 * empty string in that position instead of aborting the whole feed.
 *
 * @param urlString address of the feed to download and parse
 * @return the extracted triples; an empty list when the feed cannot be
 *         fetched or parsed, or has no `channel` element
 */
def processData(urlString: String): List[(String, String, String)] = {
  // Normalized text of the named child, or "" when the item has no such child.
  def childText(item: Element, name: String): String = {
    val child = item.getChild(name)
    if (child == null) "" else child.getTextNormalize
  }

  try {
    val builder = new SAXBuilder()
    val doc = builder.build(new URL(urlString))
    val channel = doc.getRootElement().getChild("channel")
    if (channel == null) {
      // Not an RSS 2.0 style document; report it rather than failing with a bare NPE.
      Console.println("ERR: Could not process data feed")
      Console.println("no channel element found in " + urlString)
      List()
    } else {
      val it = channel.getChildren().iterator
      var data = List[(String, String, String)]()
      while (it.hasNext) {
        it.next() match {
          case eItem: Element if "item" == eItem.getName() =>
            data = (childText(eItem, "title"),
                    childText(eItem, "description"),
                    childText(eItem, "link")) :: data
          case _ => // channel metadata (title, link, ...) is ignored
        }
      }
      data
    }
  } catch {
    case ex: Exception => {
      Console.println("ERR: Could not process data feed")
      Console.println(ex)
      List()
    }
  }
}
146 * Create a single feed item, used with (Stage Three)
/**
 * Stage Three helper: inserts a single row into system_feed_items.
 *
 * The insert is best-effort. A malformed URL, a non-null title shorter than
 * four characters, or any database error (typically a duplicate key) causes
 * the item to be skipped silently.
 *
 * @param url    link of the item; must parse as a java.net.URL
 * @param title  item title
 * @param descr  item description
 * @param source value stored in the url_source column
 */
def createItem(url: String, title: String, descr: String, source: String): Unit = {
  val insertFeedItems = conn prepareStatement "insert into system_feed_items(main_url, url_title, url_description, url_source) values(?, ?, ?, ?)"
  try {
    // Constructing a URL is used only to reject malformed links.
    new URL(url)
    if (title != null && title.length < 4) {
      throw new RuntimeException("Invalid URL Title")
    }
    // Call the postfix operator '!' to execute
    insertFeedItems << url << title << descr << source <<!;
  } catch {
    case ex: Exception => {
      // Typically, a duplicate key error will be thrown here; the item is
      // deliberately skipped without logging.
    }
  } finally {
    // Release the statement on every path; a close failure must not turn
    // this best-effort insert into a crash.
    try {
      insertFeedItems.close()
    } catch {
      case ex: Exception => ()
    }
  }
}
169 * Stage Four, Check existing entity links for an already created
170 * link.
/**
 * Stage Four: reports whether entity_links already contains a row whose
 * main_url equals `link`.
 *
 * The URL is bound as a statement parameter rather than spliced into the SQL
 * text, so quotes or other SQL metacharacters in a feed-supplied link cannot
 * alter or break the query.
 *
 * @param link URL to look up
 * @return true when at least one matching row exists
 */
def hasEntityLink(link: String): Boolean = {
  val countLinks = conn.prepareStatement("select count(1) from entity_links where main_url = ?")
  try {
    countLinks.setString(1, link)
    val rs = countLinks.executeQuery()
    // count(1) yields a single row; the next() guard covers an empty result.
    rs.next() && rs.getInt(1) > 0
  } finally {
    // Closing the statement also closes its result set.
    countLinks.close()
  }
}
/**
 * Marks the feed item identified by `url` as processed by setting its
 * process_count to 1 in system_feed_items. Failures are logged, not rethrown.
 *
 * @param url main_url of the feed item to mark
 */
def updateProcessCount(url: String): Unit = {
  val processCount = conn prepareStatement "update system_feed_items set process_count = 1 where main_url = ?"
  try {
    // Call the postfix operator '!' to execute
    processCount << url <<!;
  } catch {
    case ex: Exception => {
      Console.println("ERR: failed to update process count=" + ex)
    }
  } finally {
    // Release the statement on every path; this method never throws to its caller.
    try {
      processCount.close()
    } catch {
      case ex: Exception => Console.println("ERR: failed to update process count=" + ex)
    }
  }
}
194 * Stage Five, create new entity link.
/**
 * Stage Five: inserts a new row into entity_links for the given feed item.
 *
 * The feed item's process count is updated first, before the insert is
 * attempted. The stored title is passed through filterNonAscii, the keywords
 * are derived from that cleaned title via createKeywords, and the rating is a
 * random integer in [0, 30). Failures are logged, not rethrown.
 *
 * @param url   main_url of the link
 * @param title raw title taken from the feed item
 */
def createEntityLinkRecord(url: String, title: String): Unit = {
  val insertEntityLink = conn prepareStatement "insert into entity_links(main_url, url_title, full_name, keywords, rating, created_on) values(?, ?, ?, ?, ?, NOW())"
  try {
    // We can get away with updating the process count, we can
    // reprocess in later iterations
    updateProcessCount(url)
    val cleanTitle = filterNonAscii(title)
    val keywordTitle = createKeywords(cleanTitle)
    // Call the postfix operator '!' to execute
    insertEntityLink << url << cleanTitle << full_name_user << keywordTitle << rand.nextInt(30) <<!;
  } catch {
    case ex: Exception => {
      Console.println("ERR: failed to create entity link=" + ex)
    }
  } finally {
    // Release the statement on every path; this method never throws to its caller.
    try {
      insertEntityLink.close()
    } catch {
      case ex: Exception => Console.println("ERR: failed to create entity link=" + ex)
    }
  }
}
214 * Stage Four and Five, check and then create the entity link.
/**
 * Stage Four and Five: walks the unprocessed rows of system_feed_items
 * (process_count = 0, in random order) and creates an entity link for each
 * one whose URL is not already present in entity_links.
 *
 * At most `maxFeedItems` links are created per call; once that many have
 * been created a warning is printed and the method returns.
 */
def createEntityLinks(): Unit = {
  implicit val s: Statement = conn << ;
  try {
    var created = 0
    for (feed <- query("select main_url, url_title, url_description, url_source from system_feed_items where process_count = 0 order by RAND()", rs => FeedItem(rs,rs,rs,rs))) {
      val baseURL = feed.mainUrl
      // Check for already saved records
      if (!hasEntityLink(baseURL)) {
        createEntityLinkRecord(baseURL, feed.urlTitle)
        created = created + 1
        // Compare with >= so that exactly maxFeedItems links are created;
        // a strict > after the increment lets one extra item through.
        if (created >= maxFeedItems) {
          Console.println("WARN: max feed items reached (" + maxFeedItems + "), exiting.")
          return
        }
      } else {
        Console.println("WARN: entity link already exists=" + baseURL)
      }
    }
  } finally {
    // Release the query statement, including on the early return above.
    s.close()
  }
}
236 * Stage Three, store all new data into system_feed_items,
237 * set process_count to zero.
/**
 * Stage Three: stores every (title, description, link) triple of `list` as a
 * feed item. Each triple is handed to createItem as (link, title,
 * description) together with a fixed source of "http://www.google.com".
 *
 * @param list triples in (title, description, link) order
 */
def createFeedItems(list: List[(String, String, String)]) {
  list.foreach { entry =>
    val title = entry._1
    val descr = entry._2
    val link = entry._3
    createItem(link, title, descr, "http://www.google.com")
  }
}
/**
 * Writes the string form of `xml` to the file `filename` + ".xml",
 * replacing any existing content.
 *
 * @param xml      node to serialize via toString
 * @param filename target path without the ".xml" extension
 */
def saveXML(xml: Node, filename: String): Unit = {
  val tmpFile = new File(filename + ".xml")
  val str = new FileWriter(tmpFile)
  try {
    str.write(xml.toString())
  } finally {
    // Close even when the write fails so the file handle is not leaked.
    str.close()
  }
}
/**
 * Loads the XML document stored in `baseName` + ".xml", printing the file
 * being loaded first.
 *
 * @param baseName path of the document without the ".xml" extension
 * @return the root node of the parsed document
 */
def loadXML(baseName: String): Node = {
  val source = new File(baseName + ".xml")
  Console.println("loading..." + source)
  XML.load(source.getAbsolutePath)
}
260 * Stage Two, scan the feed list and get the URL to extract
261 * feed data.
/**
 * Stage Two: reads every row of system_scan_feeds, downloads the feed at its
 * main_url and stores the extracted items through createFeedItems.
 *
 * A failure on one feed is logged and does not stop the scan of the
 * remaining feeds.
 */
def scanFeedList(): Unit = {
  implicit val s: Statement = conn << ;
  try {
    for (feed <- RichSQL.query("select main_url,url_title, url_description, url_source from system_scan_feeds",
                               rs => ScanFeed(rs,rs,rs,rs))) {
      val baseURL = feed.mainUrl
      try {
        Console.print("processing... / ")
        createFeedItems(processData(baseURL))
      } catch {
        case ex: Exception => {
          Console.println("ERR: failed to process data and create feed item=" + ex)
        }
      }
      Console.println("done.")
    }
  } finally {
    // Release the query statement once the feed list has been consumed.
    s.close()
  }
}
/**
 * Entry point.
 *
 * The second argument (`args(1)`) is the directory holding
 * reader_properties.xml; a default file is written there when none exists.
 * Each occurrence of `-s` among the arguments runs the feed scan (Stage Two
 * and Three) and each occurrence of `-e` runs the entity link creation
 * (Stage Four and Five).
 *
 * The shared database connection is closed on every exit path.
 *
 * @param args command-line arguments as described above
 */
def main(args: Array[String]): Unit = {
  try {
    if (args.length < 2) {
      // args(1) is required below; fail with a message instead of an
      // ArrayIndexOutOfBoundsException.
      Console.println("ERR: missing argument, the directory of reader_properties.xml is expected as the second argument")
    } else {
      val configDir = args(1)
      Console.println(configDir)
      val tmpFile = new File(configDir + "/reader_properties.xml")
      if (!tmpFile.exists) {
        Console.println("* Writing new reader properties XML configuration file")
        saveXML(valDefaultPropertyXML, configDir + "/reader_properties")
      } else {
        Console.println("+ Not writing reader properties configure, already exists")
      }
      // The stored configuration is parsed here, which surfaces a malformed
      // file early, but its values are not applied: the settings used below
      // are the defaults defined on this object.
      loadXML(configDir + "/reader_properties")

      // Connect
      val start = System.currentTimeMillis
      System.setProperty("proxySet", enable_proxy)
      System.setProperty("proxyHost", proxy_host)
      System.setProperty("proxyPort", proxy_port)
      System.setProperty("http.agent", useragent)

      // Echo the arguments, which may contain -s=scan feed list and -e=create entity links
      args.foreach(arg => Console.println("Args[ " + arg + "]"))

      // Initiate stage two, scan feeds
      args.foreach { arg =>
        if (arg == "-s") {
          Console.println("INFO: scan feed list flag enabled")
          scanFeedList()
        }
      }

      // Stage Four and Five, check for existing links and commit
      args.foreach { arg =>
        if (arg == "-e") {
          Console.println("INFO: create entity links flag enabled")
          createEntityLinks()
        }
      }

      val end = System.currentTimeMillis
      Console.println("processed in=" + (end - start) / 1000.0 + "s")
    }
  } finally {
    // Always release the shared connection, even when a stage throws.
    conn.close()
  }
}