3 // botlist_scanfeeds.scala
4 // Read RSS feeds based on list stored in database
7 // * Read the property file with settings (reader_properties.xml)
9 // * Scan the system_scan_feeds table for a list of feeds
11 // * Store all new data into system_feed_items, set process_count to zero
13 // * Check the entity_links table for existing content
14 // * Process a set number of items from system_feed_items and update their process_count
16 // * Store in entity_links
23 import org
.jdom
.{Document
=> JDocument
}
24 import java
.util
.{Iterator
=> JIterator
}
25 import org
.jdom
.input
._
27 import scala
.dbc
.Syntax
._
28 import scala
.dbc
.statement
._
29 import scala
.dbc
.statement
.expression
._
30 import scala
.dbc
.value
._
31 import scala
.dbc
.syntax
._
32 import syntax
.Statement
._
33 import scala
.dbc
.statement
.{Select
=> DBCSelect
}
34 import java
.sql
.{DriverManager
, Connection
, ResultSet
, PreparedStatement
, Statement
, Date
}
35 import DriverManager
.{getConnection
=> connect
}
38 import KeywordProcessor
._
40 import java
.util
.Random
42 //------------------------------------------------
43 // Bean Definitions for Object Mapping
44 //------------------------------------------------
/**
 * One feed to scan, built from a row of the system_scan_feeds query.
 *
 * The constructor parameters keep the database column names; every value is
 * also exposed under a camelCase alias.
 */
case class ScanFeed(
    main_url: String,
    url_title: String,
    url_description: String,
    url_source: String) {
  // camelCase views of the column-named constructor parameters
  val mainUrl: String = main_url
  val urlTitle: String = url_title
  val urlDescription: String = url_description
  val urlSource: String = url_source
}
/**
 * One pending feed entry, built from a row of the system_feed_items query.
 *
 * The constructor parameters keep the database column names; every value is
 * also exposed under a camelCase alias.
 */
case class FeedItem(
    main_url: String,
    url_title: String,
    url_description: String,
    url_source: String) {
  // camelCase views of the column-named constructor parameters
  val mainUrl: String = main_url
  val urlTitle: String = url_title
  val urlDescription: String = url_description
  val urlSource: String = url_source
}
/**
 * Three-field bean holding a URL, its title and its description.
 *
 * The constructor parameters use column-style names; every value is also
 * exposed under a camelCase alias.
 */
case class EntityLink(
    main_url: String,
    url_title: String,
    url_description: String) {
  // camelCase views of the column-named constructor parameters
  val mainUrl: String = main_url
  val urlTitle: String = url_title
  val urlDescription: String = url_description
}
// Single-field bean carrying an integer `total`; hasEntityLink builds one
// from each row of its "select count(1)" query.
// NOTE(review): nothing between the opening brace and the end of the class
// is visible in this listing -- confirm the body against the full source.
66 case class CountFeedItems(total
: Int
) {
70 //------------------------------------------------
71 // End of Bean Definitions
72 //------------------------------------------------
77 object BotListScanFeeds
{
// Name written to entity_links.full_name for every link this bot creates.
val full_name_user = "botbert99"

// User-Agent string; main installs it as the "http.agent" system property.
val useragent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1)"

// Reader settings. They are kept as strings because they are embedded
// verbatim as attribute values in valDefaultPropertyXML; the proxy values are
// also copied into JVM system properties by main.
// split_url_line is declared here but is not part of the XML document.
val enable_proxy = "false"
val proxy_host = "the_proxy"
val proxy_port = "9999"
val update_delay = "20"
val debug_enabled = "true"
val append_file = "false"
val read_description = "false"
val split_url_line = "true"
val news_feed = "http://reddit.com/new.rss"
val use_feed_file = "true"
val feed_file = "newsfeeds.dat"

// Clock-seeded generator; used to pick the rating of new entity links.
val rand = new Random(System.currentTimeMillis())

// Database settings. Each value can be overridden with a JVM system property
// (for example -Dbotlist.db.password=...), so credentials no longer have to
// be edited into the source. The defaults are the previous hard-coded values,
// so running without any -D option behaves exactly as before.
val nativeDriverClass =
  Class.forName(System.getProperty("botlist.db.driver", "com.mysql.jdbc.Driver"))
val conn = connect(
  System.getProperty("botlist.db.url", "jdbc:mysql://localhost/botlist_development"),
  System.getProperty("botlist.db.user", "user"),
  System.getProperty("botlist.db.password", "PASSWORD"))
// Default reader configuration as an XML literal: a <ReaderConfiguration>
// root with one <property name=... value=.../> element per setting, each
// value taken from the like-named val of this object.
// main hands this document to saveXML when reader_properties.xml does not
// exist yet.
101 val valDefaultPropertyXML
=
102 <ReaderConfiguration
>
104 <property name
="enable.proxy" value
={enable_proxy
} />
105 <property name
="proxy.host" value
={proxy_host
} />
106 <property name
="proxy.port" value
={proxy_port
} />
107 <property name
="update.delay" value
={update_delay
} />
108 <property name
="debug.enabled" value
={debug_enabled
} />
109 <property name
="append.file" value
={append_file
} />
110 <property name
="read.description" value
={read_description
} />
111 <property name
="news.feed" value
={news_feed
} />
112 <property name
="use.feed.file" value
={use_feed_file
} />
113 <property name
="feed.file" value
={feed_file
} />
115 </ReaderConfiguration
>;
// Fetches the document at urlString with a SAXBuilder and walks the children
// of the root's <channel> element. For every child named "item" the
// whitespace-normalized text of its <title>, <description> and <link>
// children is prepended to `data` as a (title, description, link) tuple, so
// the tuples accumulate in reverse document order.
// The visible Exception handler prints "ERR: Could not process data feed"
// without reporting the exception itself.
// NOTE(review): getChild presumably returns null when an <item> lacks one of
// the three children, in which case getTextNormalize would throw -- confirm
// whether losing the feed in that case is intended.
// NOTE(review): the loop header, the `try` keyword and the returned
// expressions are not visible in this listing -- confirm against the full
// source.
117 def processData(urlString
:String
): List
[(String
, String
, String
)] =
119 val builder
= new SAXBuilder()
120 val doc
= builder
.build(new URL(urlString
))
121 val root
= doc
.getRootElement()
122 val channel
= root
.getChild("channel")
123 val items
= channel
.getChildren()
124 val it
= items
.iterator
125 var data
= List
[(String
, String
, String
)]()
127 val eItem
= it
.next
.asInstanceOf
[Element
]
128 if ("item" == eItem
.getName()) {
129 val eTitle
= eItem
.getChild("title");
130 val eDescr
= eItem
.getChild("description");
131 val eLink
= eItem
.getChild("link");
132 data
= (eTitle
.getTextNormalize
,
133 eDescr
.getTextNormalize
, eLink
.getTextNormalize
) :: data
138 case ex
:Exception
=> {
139 Console
.println("ERR: Could not process data feed")
146 * Create a single feed item, used with (Stage Three)
/**
 * Stage Three helper: inserts one row into system_feed_items.
 *
 * Before the insert the URL is parsed with `new URL` so that malformed URLs
 * are rejected, and a non-null title shorter than four characters raises
 * "Invalid URL Title". Every exception raised inside the guarded section --
 * validation failures as well as the typical duplicate-key error from the
 * insert -- is deliberately swallowed, so a rejected item is skipped silently.
 */
def createItem(url: String, title: String, descr: String, source: String) {
  val itemInsert = conn.prepareStatement(
    "insert into system_feed_items(main_url, url_title, url_description, url_source) values(?, ?, ?, ?)")
  try {
    // Constructed only for its validation side effect (throws when malformed).
    val parsedUrl = new URL(url)
    val titleTooShort = (title != null) && (title.length < 4)
    if (titleTooShort) {
      throw new RuntimeException("Invalid URL Title")
    }
    // Bind the four parameters in column order; the postfix '!' executes.
    itemInsert << url << title << descr << source <<!;
  } catch {
    case ex: Exception => {
      // Typically a duplicate key error will be thrown here; it is ignored
      // on purpose so that the remaining items are still processed.
    }
  }
}
169 * Stage Four, Check existing entity links for an already created
// Stage Four: runs "select count(1)" against entity_links for rows whose
// main_url equals `link`, mapping each result row to a CountFeedItems.
// NOTE(review): `link` is concatenated directly into the SQL text, so a URL
// containing a single quote breaks the statement and permits SQL injection.
// The insert/update statements in this file bind their values through '?'
// placeholders; consider doing the same here.
// NOTE(review): the loop body and the returned value are not visible in this
// listing -- confirm how the count becomes the boolean result.
172 def hasEntityLink(link
: String
): boolean = {
173 implicit val s
: Statement
= conn
<< ;
174 for (val ctObj
<- RichSQL
.query("select count(1) from entity_links where main_url = '" + link
+ "'", rs
=> CountFeedItems(rs
))) {
/**
 * Marks the system_feed_items row whose main_url is `url` as processed by
 * setting its process_count to 1.
 *
 * A failure is reported on the console and not rethrown.
 */
def updateProcessCount(url: String) {
  val markProcessed = conn.prepareStatement(
    "update system_feed_items set process_count = 1 where main_url = ?")
  try {
    // Bind the URL; the postfix '!' operator executes the statement.
    markProcessed << url <<!;
  } catch {
    case ex: Exception =>
      Console.println("ERR: failed to update process count=" + ex)
  }
}
194 * Stage Five, create new entity link.
/**
 * Stage Five: inserts a row into entity_links for the given URL and title.
 *
 * The feed item is flagged as processed first. The title is then passed
 * through filterNonAscii, keywords are derived from the cleaned title with
 * createKeywords, and the row is stored under full_name_user with a random
 * rating drawn from rand.nextInt(30). A failure is reported on the console
 * and not rethrown.
 */
def createEntityLinkRecord(url: String, title: String) {
  val linkInsert = conn.prepareStatement(
    "insert into entity_links(main_url, url_title, full_name, keywords, rating, created_on) values(?, ?, ?, ?, ?, NOW())")
  try {
    // Updating the process count up front is acceptable: the item can be
    // reprocessed in a later iteration.
    updateProcessCount(url)
    val asciiTitle = filterNonAscii(title)
    val titleKeywords = createKeywords(asciiTitle)
    // Bind the five parameters in column order; the postfix '!' executes.
    linkInsert << url << asciiTitle << full_name_user << titleKeywords << rand.nextInt(30) <<!;
  } catch {
    case ex: Exception =>
      Console.println("ERR: failed to create entity link=" + ex)
  }
}
214 * Stage Four and Five, check and then create the entity link.
// Stages Four and Five: reads the system_feed_items rows with
// process_count = 0 (ordered by RAND()), maps each row to a FeedItem and asks
// hasEntityLink whether its URL is already stored. The visible calls are
// createEntityLinkRecord(baseURL, title) and a warning printed when the
// entity link already exists; once `ctr` exceeds `maxFeedItems` a warning is
// printed and the method returns.
// `descr` is read from the row but not used in the visible lines.
// NOTE(review): the condition that selects between the two branches and the
// definitions/updates of `ctr` and `maxFeedItems` are not visible in this
// listing -- confirm against the full source.
216 def createEntityLinks() {
217 implicit val s
: Statement
= conn
<< ;
219 for (val feed
<- query("select main_url, url_title, url_description, url_source from system_feed_items where process_count = 0 order by RAND()", rs
=> FeedItem(rs
,rs
,rs
,rs
))) {
220 val baseURL
= feed
.mainUrl
221 val title
= feed
.urlTitle
222 val descr
= feed
.urlDescription
223 // Check for already saved records
224 val hasLink
= hasEntityLink(baseURL
)
226 createEntityLinkRecord(baseURL
, title
)
228 if (ctr
> maxFeedItems
) { Console
.println("WARN: max feed items reached (" + maxFeedItems
+ "), exiting."); return; }
230 Console
.println("WARN: entity link already exists=" + baseURL
)
236 * Stage Three, store all new data into system_feed_items,
237 * set process_count to zero.
/**
 * Stage Three: stores every parsed feed entry through createItem.
 *
 * @param list tuples in the order produced by processData:
 *             (title, description, link)
 */
def createFeedItems(list: List[(String, String, String)]) {
  list.foreach { entry =>
    // createItem expects (url, title, descr, source): the link is the third
    // tuple element, the title the first, the description the second.
    // Every item is stored with the same fixed source URL.
    createItem(entry._3, entry._1, entry._2, "http://www.google.com")
  }
}
// Writes xml.toString() to the file "<filename>.xml" through a FileWriter.
// NOTE(review): no close() or flush() on the writer is visible in this
// listing; if none exists, the written text may never reach the file --
// confirm against the full source.
245 def saveXML(xml
: Node
, filename
: String
) = {
246 val tmpFile
= new File(filename
+ ".xml")
247 val str
= new FileWriter(tmpFile
)
248 str
.write(xml
.toString())
// Loads "<baseName>.xml": prints a "loading..." message naming the file, then
// parses it with XML.load using the file's absolute path.
// NOTE(review): the expression that returns the loaded Node is not visible in
// this listing -- presumably `xml`; confirm against the full source.
252 def loadXML(baseName
: String
):Node
= {
253 val file
= new File(baseName
+ ".xml")
254 Console
.println("loading..." + file
)
255 val xml
= XML
.load(file
.getAbsolutePath
)
260 * Stage Two, scan the feed list and get the URL to extract
// Body of the Stage Two scan (its `def` line is not visible in this listing).
// Reads every row of system_scan_feeds as a ScanFeed, prints a progress
// marker and parses the feed at the row's main_url with processData.
// The visible Exception handler logs the failure together with the exception;
// "done." is printed afterwards.
// NOTE(review): what is done with `res`, where the `try` starts, and whether
// "done." is printed per feed or once at the end are not visible here --
// confirm against the full source.
264 implicit val s
: Statement
= conn
<< ;
265 for (val feed
<- RichSQL
.query("select main_url,url_title, url_description, url_source from system_scan_feeds",
266 rs
=> ScanFeed(rs
,rs
,rs
,rs
))) {
267 val baseURL
= feed
.mainUrl
269 Console
.print("processing... / ")
270 val res
= processData(baseURL
)
273 case ex
:Exception
=> {
274 Console
.println("ERR: failed to process data and create feed item=" + ex
)
277 Console
.println("done.")
// Entry point. Treats args(1) as the directory that holds
// reader_properties.xml: when the file does not exist the default
// configuration (valDefaultPropertyXML) is written there via saveXML, and the
// file is then loaded with loadXML.
// Afterwards the proxy settings and the user agent are installed as JVM
// system properties, every argument is echoed, the arguments are scanned for
// the stage flags, and the elapsed time since `start` is printed in seconds.
// NOTE(review): args(1) is read unconditionally, so running with fewer than
// two arguments fails with an index-out-of-bounds error -- confirm the
// expected command line.
// NOTE(review): `nxml` and `tcur` are not used in the visible lines, and the
// system properties are set from the compiled-in defaults rather than from
// the loaded file -- confirm whether that is intended.
// NOTE(review): the flag comparisons and the stage calls inside the last two
// loops are not visible in this listing.
281 def main(args
: Array
[String
]): Unit
= {
283 Console
.println(args(1))
284 val tcur
= System
.currentTimeMillis
285 val tmpFile
= new File(args(1) + "/reader_properties.xml")
286 if (!tmpFile
.exists
) {
287 Console
.println("* Writing new reader properties XML configuration file")
288 saveXML(valDefaultPropertyXML
, args(1) + "/reader_properties")
290 Console
.println("+ Not writing reader properties configure, already exists")
292 val nxml
= loadXML(args(1) + "/reader_properties")
294 val start
= System
.currentTimeMillis
295 System
.getProperties().put("proxySet", enable_proxy
)
296 System
.getProperties().put("proxyHost", proxy_host
)
297 System
.getProperties().put("proxyPort", proxy_port
)
298 System
.getProperties().put("http.agent", useragent
)
300 // Check for args that might contain -s=scan feed list and -e=create entity links
301 for (val arg
<- args
) {
302 Console
.println("Args[ " + arg
+ "]")
305 // Initiate stage two, scan feeds
306 for (val arg
<- args
) {
307 // Check for scan feed flag
309 Console
.println("INFO: scan feed list flag enabled");
314 // Stage Four and Five, check for existing links and commit
315 for (val arg
<- args
) {
316 // Check for scan feed flag
318 Console
.println("INFO: create entity links flag enabled");
324 val end
= System
.currentTimeMillis
325 Console
.println("processed in=" + (end
-start
)/1000.0 + "s")