Fetching code for wakabas
[PyChans.git] / updateArchive.py
blobb3ce4142dbc9eaa72d09f2b3ec7f19273972a290
1 # coding=utf-8
2 from paste.script.command import Command
3 from fc.lib.base import *
4 from fc.model import *
5 from sqlalchemy.orm import eagerload
6 from sqlalchemy.orm import class_mapper
7 from sqlalchemy.sql import and_, or_, not_
8 import sqlalchemy
9 import os
10 import cgi
11 import shutil
12 import datetime
13 import time
14 import Image
15 import hashlib
16 import re
17 from fc.lib.fuser import FUser
18 from fc.lib.miscUtils import *
19 from fc.lib.constantValues import *
20 from fc.lib.settings import *
21 from fc.lib.fileHolder import AngryFileHolder
22 import sys
23 import paste.fixture
24 import paste.registry
25 import paste.deploy.config
26 from paste.deploy import loadapp, appconfig
27 from paste.script.command import Command, BadCommand
28 from paste.script.filemaker import FileOp
29 from paste.script.pluginlib import find_egg_info_dir
30 import urllib2
31 import httplib
32 from lxml import etree
33 import StringIO
34 from fc.model.arch import *
35 import logging
def can_import(name):
    """Report whether the module or package called ``name`` can be imported.

    Returns True when ``__import__(name)`` succeeds and False when it raises
    ImportError. Any other exception propagates to the caller.
    """
    try:
        __import__(name)
    except ImportError:
        return False
    else:
        return True
def unicodify(text):
    """Decode ``text`` from UTF-8 when it is a ``str`` instance.

    Values of any other type are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    return text.decode('utf-8')
# Maps a post number on the source board to
# [id of the saved local post, number of the source thread].
idList = {}
# Maps a filter name (as given in a comma-separated filter list) to the
# filter instance that implements it.
GFilters = {}
class DateTimeParser:
    """Turn textual dates into ``datetime.datetime`` objects.

    Two layouts are understood:
      * "<prefix> DD <month name> YYYY HH:MM:SS" (``getDateTime``), where the
        month name is any spelling listed in ``monthes``;
      * "YYYY-MM-DD HH:MM:SS" (``getDateTimeFromISO8601``).

    Both methods return None when the text cannot be parsed.
    """
    # One tuple per calendar month, January first; each tuple lists the
    # spellings accepted for that month.
    monthes = [('Янв','Jan','января'),('Фев','Feb','февраля'),('Мар','Mar','марта'),('Апр','Apr','апреля'),('Май','May','мая'),('Июн','Jun','июня'),('Июл','Jul','июля'),('Авг','Aug','августа'),('Сен','Sep','сентября'),('Окт','Oct','октября'),('Ноя','Nov','ноября'),('Дек','Dec','декабря')]
    dateRe = re.compile(r"""[^\d]+(\d+)\s+([^\d\s]+)\s+(\d+)\s+(\d+)\:(\d+)\:(\d+)""")
    dateReISO = re.compile(r"""(\d+)\-(\d+)\-(\d+) (\d+)\:(\d+)\:(\d+)""")

    def getDateTime(self, date):
        """Parse "<prefix> DD <month> YYYY HH:MM:SS".

        Returns a ``datetime.datetime``, or None when ``date`` is empty, does
        not match the expected layout, or names an unknown month.
        Raises ValueError when the numbers do not form a valid date.
        """
        if not date:
            return None
        match = self.dateRe.search(date)
        if match is None:
            # Previously an IndexError; unparsable input is now reported the
            # same way as an unknown month name.
            return None
        day, monthName, year, hour, minute, second = match.groups()
        for index, spellings in enumerate(self.monthes):
            if monthName in spellings:
                return datetime.datetime(int(year), index + 1, int(day),
                                         int(hour), int(minute), int(second))
        return None

    def getDateTimeFromISO8601(self, date):
        """Parse "YYYY-MM-DD HH:MM:SS".

        Returns a ``datetime.datetime``, or None when ``date`` is empty or
        does not contain such a timestamp.
        Raises ValueError when the numbers do not form a valid date.
        """
        if not date:
            return None
        match = self.dateReISO.search(date)
        if match is None:
            return None
        year, month, day, hour, minute, second = [int(part) for part in match.groups()]
        return datetime.datetime(year, month, day, hour, minute, second)


# Shared parser instance used by the loaders and board parsers.
DTP = DateTimeParser()
class IBParser:
    """Helpers shared by imageboard page parsers.

    Subclasses are expected to provide the ``replyIdRe`` and ``postIdRe``
    compiled patterns used by ``GetPostID``.
    """

    def _tagMatches(self, el, tag):
        """Return True when ``el`` is an element named ``tag`` (lower-case)."""
        name = el.tag
        # Comment and processing-instruction nodes expose a callable ``tag``
        # rather than a string; they can never match a tag name.
        if not name or not hasattr(name, 'lower'):
            return False
        return name.lower() == tag

    def GetNextTag(self, el, tag, skip=0):
        """Find the first node named ``tag`` among ``el`` and its following siblings.

        ``tag`` is compared case-insensitively. When ``skip`` is true the
        search starts at the sibling after ``el``. Returns the matching node
        or None when there is none.
        """
        tag = tag.lower()
        if skip:
            node = el.getnext()
        else:
            node = el
        while node is not None:
            if self._tagMatches(node, tag):
                return node
            node = node.getnext()
        return None

    def GetPreviousTag(self, el, tag, skip=0):
        """Find the first node named ``tag`` among ``el`` and its preceding siblings.

        ``tag`` is compared case-insensitively. When ``skip`` is true the
        search starts at the sibling before ``el``. Returns the matching node
        or None when there is none.
        """
        tag = tag.lower()
        if skip:
            node = el.getprevious()
        else:
            node = el
        while node is not None:
            if self._tagMatches(node, tag):
                return node
            node = node.getprevious()
        return None

    def ResolveSecondaryId(self, thread, Ids):
        """Translate source-board numbers into the id of a locally stored post.

        ``Ids`` is [thread number, post number] on the source board. The
        ``idList`` cache is consulted first; otherwise the database is
        searched for a thread-opening post (``parentid == -1``) tagged with
        both ``thread.chanTag`` and ``thread.board``, and then for the reply
        inside it. Returns the local post id, or None when nothing is found.
        """
        postNumber = int(Ids[1])
        if postNumber in idList:
            return idList[postNumber][0]

        tagsf = and_(Post.tags.any(tag=thread.chanTag), Post.tags.any(tag=thread.board))
        f2 = and_(Post.parentid == -1, tagsf)
        f1 = and_(Post.secondaryIndex == Ids[0], f2)
        opPost = meta.Session.query(Post).filter(f1).first()
        if not opPost:
            return None
        if Ids[0] == Ids[1]:
            # The reference points at the thread itself.
            return opPost.id
        replyFilter = and_(Post.secondaryIndex == postNumber, Post.parentid == opPost.id)
        reply = meta.Session.query(Post).filter(replyFilter).first()
        if reply:
            return reply.id
        return None

    def GetPostID(self, post):
        """Return [thread number, post number] for ``post``.

        When ``post.thread`` is set, the thread number comes from
        ``post.thread.tid`` and the post number from the first ``replyIdRe``
        match in ``post.href``. Otherwise both are taken from the first
        ``postIdRe`` match; a missing (or zero) anchor number falls back to
        the thread number.
        """
        if post.thread:
            ids = self.replyIdRe.findall(post.href)
            return [post.thread.tid, int(ids[0])]
        ids = self.postIdRe.findall(post.href)
        threadNumber = int(ids[0][0])
        anchor = ids[0][2]
        postNumber = 0
        if anchor:
            postNumber = int(anchor)
        return [threadNumber, postNumber or threadNumber]
class Loader:
    """Base class of the page loaders; supplies the shared link splitter."""

    def parseLink(self, link):
        """Split ``link`` into the pieces the loaders work with.

        Returns a six-item list:
          [0] the text before '://', or None when there is no protocol;
          [1] the text up to the first '/' after the protocol;
          [2] protocol prefix + item [1] + '/';
          [3] protocol prefix + everything up to and including the last '/';
          [4] the text after the last '/';
          [5] '/' + everything after item [1].
        """
        pieces = link.split('://')
        proto = None
        if len(pieces) > 1 and pieces[0]:
            proto = pieces[0]
        prefix = ''
        if proto:
            prefix = proto + '://'
        segments = pieces[-1].split('/')
        head = segments[0]
        return [
            proto,
            head,
            prefix + head + '/',
            prefix + '/'.join(segments[:-1]) + '/',
            segments[-1],
            '/' + '/'.join(segments[1:]),
        ]
class LoaderLocal(Loader):
    """Loader that reads pages and files from the local file system."""

    def __init__(self, link):
        """Remember the directory part of ``link`` as the base for relative paths."""
        self.relativeUrl = self.parseLink(link)[3]

    def stat(self, link):
        """Return [modification time, size in bytes] of ``link``, or None if it cannot be stat'ed."""
        try:
            stats = os.stat(link)
            # Index 8 is st_mtime, index 6 is st_size.
            return [datetime.datetime.fromtimestamp(stats[8]), stats[6]]
        except OSError:
            return None

    def get(self, url):
        """Return the raw bytes of the file at ``url``.

        Raises IOError when the file cannot be opened or read.
        """
        source = open(url, 'rb')
        try:
            return source.read()
        finally:
            # Close explicitly instead of relying on garbage collection.
            source.close()

    def getAbsolutePath(self, url):
        """Resolve ``url`` against the directory of the link given to the constructor."""
        return self.relativeUrl + url

    def getFromRelative(self, url):
        """Read the file that ``url`` refers to relative to the base directory."""
        return self.get(self.getAbsolutePath(url))
class LoaderHTTP(Loader):
    """Loader that fetches pages and files over HTTP.

    NOTE(review): ``stat`` always opens a plain ``httplib.HTTPConnection``,
    whatever protocol the link names.
    """

    def __init__(self, link):
        """Split ``link`` and remember protocol, host, site root and directory."""
        p = self.parseLink(link)
        self.proto = p[0]
        self.host = p[1]
        self.baseUrl = p[2]
        self.relativeUrl = p[3]

    def stat(self, link):
        """Issue a HEAD request for ``link``.

        Returns [parsed Last-Modified (or Date) header, Content-Length header
        value] when the server answers 200, and None for any other status.
        """
        linkp = self.parseLink(link)
        connection = httplib.HTTPConnection(linkp[1])
        try:
            connection.request('HEAD', linkp[5])
            response = connection.getresponse()
            if response.status != 200:
                return None
            size = response.getheader('content-length', 0)
            date = response.getheader('last-modified', response.getheader('date', None))
        finally:
            # The connection used to be left open after every check.
            connection.close()
        return [DTP.getDateTime(date), size]

    def get(self, url):
        """Download ``url`` with the site root as Referer.

        Returns the response body, or None when the server reports an HTTP
        error. Other urllib2 errors propagate.
        """
        req = urllib2.Request(url)
        req.add_header('Referer', self.baseUrl)
        try:
            response = urllib2.urlopen(req)
            try:
                return response.read()
            finally:
                response.close()
        except urllib2.HTTPError:
            return None

    def getAbsolutePath(self, url):
        """Resolve ``url`` against the site root (leading '/') or the page directory."""
        if url.startswith('/'):
            # baseUrl already ends with '/', so drop the leading one to avoid
            # producing "http://host//path".
            return self.baseUrl + url[1:]
        return self.relativeUrl + url

    def getFromRelative(self, url):
        """Download the resource that ``url`` refers to relative to the loaded page."""
        return self.get(self.getAbsolutePath(url))
class IBFilter:
    """Base post filter: a truthy result from ``filter`` rejects the post."""

    def filter(self, post):
        """Accept every post (always returns None)."""
        return None


class IBFilterSage(IBFilter):
    """Rejects posts flagged as sage."""

    def filter(self, post):
        """Return the post's ``sage`` flag."""
        return post.sage


class IBFilterLowres(IBFilter):
    """Rejects posts whose picture is narrower than 50 pixels."""

    def filter(self, post):
        """Return the falsy ``post.pic`` when there is no picture, else whether its width is below 50."""
        picture = post.pic
        if not picture:
            return picture
        return picture.width < 50
class Thread:
    """One source thread being archived: where to fetch it, how to parse it, how to filter it."""

    def __init__(self, entry, parsers, directlink=None, forcetype=None):
        """Copy the settings of ``entry`` and choose parser and loader.

        ``parsers`` maps a board type name to a parser object. ``directlink``
        overrides the link the parser would build. A link with a protocol is
        fetched with LoaderHTTP, anything else with LoaderLocal. Raises
        KeyError for an unknown board type or filter name.
        """
        self.parser = parsers[entry.type]
        self.tid = entry.tid
        self.url = entry.url
        self.board = entry.board
        self.chanTag = entry.chanTag
        if entry.tags:
            self.tags = entry.tags.split(',')
        else:
            self.tags = []
        self.type = entry.type
        self.forcetype = forcetype
        self.lastChanged = entry.lastChanged
        filterNames = []
        if entry.filters:
            filterNames = entry.filters.split(',')
        self.filters = [GFilters[filterName] for filterName in filterNames]

        self.timeDiff = entry.timeDiff
        self.directlink = directlink
        # A bare Loader is enough to split the link and pick the real loader.
        self.loader = Loader()
        if not self.directlink:
            self.directlink = self.parser.GetThreadLink(self.url, self.board, self.tid)
        hasProtocol = self.loader.parseLink(self.directlink)[0]
        if hasProtocol:
            self.loader = LoaderHTTP(self.directlink)
        else:
            self.loader = LoaderLocal(self.directlink)

    def checkState(self):
        """Compare the source's modification time with ``lastChanged``.

        Returns [404] when the loader cannot stat the source,
        [200, time, size] when it is newer than ``lastChanged`` and
        [304, time, size] otherwise.
        """
        stat = self.loader.stat(self.directlink)
        if not stat:
            return [404]
        modified, size = stat[0], stat[1]
        if modified > self.lastChanged:
            code = 200
        else:
            code = 304
        return [code, modified, size]

    def initialize(self):
        """Load and parse the thread page.

        Sets ``document``, ``posts`` and ``threadId``. Returns True when the
        page was loaded and the parser found posts, False otherwise.
        """
        page = self.loader.get(self.directlink)
        if not page:
            return False
        htmlParser = etree.HTMLParser()
        if isinstance(page, str):
            page = page.decode('utf-8')
        self.document = etree.parse(StringIO.StringIO(page), htmlParser)
        self.posts = self.parser.GetPostsList(self)
        self.threadId = self.parser.ResolveSecondaryId(self, [self.tid, self.tid])
        return bool(self.posts)

    def filter(self, post):
        """Run ``post`` through the filters.

        Returns the first truthy filter result; when no filter rejects the
        post, the result of the last filter (or None without filters).
        """
        verdict = None
        for candidate in self.filters:
            verdict = candidate.filter(post)
            if verdict:
                break
        return verdict

    def ReplaceReference(self, m):
        """Regex substitution callback that rewrites a ">>N" post link.

        Groups 2 and 3 of ``m`` are the referenced thread and post numbers on
        the source board. When both resolve to local posts the link points at
        them; otherwise an error is printed and a "/secondaryIndex/" link
        built from the source numbers is returned.
        """
        groups = m.groups()
        threadNumber, postNumber = groups[1], groups[2]
        tid = self.parser.ResolveSecondaryId(self, [threadNumber, threadNumber])
        if tid:
            pid = tid
            if threadNumber != postNumber:
                pid = self.parser.ResolveSecondaryId(self, [threadNumber, postNumber])
            if pid:
                return '<a href="/%s#i%s" onclick="highlight(%s)">&gt;&gt;%s</a>' % (tid, pid, pid, postNumber)
        print("ERROR! %s/%s does not exist!" % (threadNumber, postNumber))
        return '<a href="/secondaryIndex/%s#i%s" onclick="highlight(%s)">&gt;&gt;%s</a>' % (threadNumber, postNumber, postNumber, postNumber)
class WakabaParser(IBParser):
    """Parser for thread pages of "wakaba"-type boards."""
    # Post number inside a reflink href such as ">>123".
    replyIdRe = re.compile(r""">>(\d+)""")
    # Thread number and optional "#i<post>" anchor of a ".../<thread>.html" link.
    postIdRe = re.compile(r"""\/(\d+)\.x?h?t?ml?(#i?(\d+))?""")
    # A serialized "&gt;&gt;N" link: group 2 is the thread, group 3 the post.
    # Raw string, so the regex escapes are not read as string escapes.
    referenceRe = re.compile(r"""<a [^>]*href="([^"]*/)?(\d+)\.[^"]+"[^>]*>\&gt\;\&gt\;(\d+)</a>""")

    def GetThreadLink(self, url, board, thread):
        """Build the address of a thread page from host, board and thread number."""
        return 'http://' + url + '/' + board + '/res/' + str(thread) + '.html'

    def GetPostsList(self, thread):
        """Create one ``Post`` per reflink anchor found in ``thread.document``.

        Each post gets ``thread``, ``href``, ``reflink`` (the anchor's parent
        node), ``Ids`` and ``secondaryIndex``. Returns the list of posts, or
        None when the page contains no reflinks.
        """
        anchors = thread.document.xpath("/html/body/form//*[@class='reflink']/a")
        if not anchors:
            return None
        postsList = []
        for anchor in anchors:
            post = Post()
            post.thread = thread
            post.href = anchor.get('href')
            post.reflink = anchor.getparent()
            post.Ids = self.GetPostID(post)
            post.secondaryIndex = int(post.Ids[1])
            postsList.append(post)
        return postsList

    def GetImgSrc(self, post):
        """Return the href of the post's attached file, or None.

        Looks through the siblings of the post's label for the first link
        whose href contains '/src/'. When the thread has ``forcetype`` set the
        link is rewritten to '../src/<file name>'.
        """
        for child in post.l.getparent():
            tag = child.tag
            # Comment nodes have a non-string tag; skip them instead of
            # failing on ``.lower()``.
            if not hasattr(tag, 'lower') or tag.lower() != 'a':
                continue
            href = child.get('href')
            if href and href.find('/src/') != -1:
                if post.thread.forcetype:
                    return '../src/' + post.thread.loader.parseLink(href)[4]
                return href
        return None

    def ParseText(self, post):
        """Serialize the post body and rewrite its post references.

        Returns u'' when the post has no blockquote, None (after printing a
        message) when the serialized markup is not wrapped in a plain
        <blockquote> element, and otherwise the inner markup with references
        replaced through ``post.thread.ReplaceReference``.
        """
        if post.bq is None:
            return u''
        # Drop the text that follows the blockquote so that it is not serialized.
        post.bq.tail = ''
        message = etree.tostring(post.bq, pretty_print=False, encoding='utf-8')
        wrapped = message[:12].lower() == '<blockquote>' and message[-13:].lower() == '</blockquote>'
        if not wrapped:
            print("Cant parse this message : '%s'" % message)
            return None
        message = message[12:-13]
        return self.referenceRe.sub(post.thread.ReplaceReference, message)

    def parsePost(self, post):
        """Fill ``post`` with the data found around its reflink.

        Sets ``bq``, ``l``, ``title``, ``cpn``, ``sage`` (plus ``cpnHref``
        when the name element starts with a link), ``src``, ``date`` and
        ``message``.
        """
        post.bq = self.GetNextTag(post.reflink, 'blockquote')
        post.l = self.GetPreviousTag(post.reflink, 'label')
        post.title = unicodify(post.l[1].text)
        if not post.title:
            post.title = u''
        post.cpn = post.l[2]
        post.sage = False
        if len(post.cpn) > 0:
            firstTag = post.cpn[0].tag
            if hasattr(firstTag, 'lower') and firstTag.lower() == 'a':
                post.cpnHref = post.cpn[0].get('href')
                # A link mentioning "sage" marks the post as sage.
                if post.cpnHref.find('sage') > -1:
                    post.sage = True
        post.src = self.GetImgSrc(post)
        # The date is the text that follows the third child of the label.
        date = post.l[2].tail.encode('utf-8')
        date = date.replace("\r", '').replace("\n", '')
        post.date = DTP.getDateTime(date)
        post.message = unicodify(self.ParseText(post))
class UpdateArchive(Command):
    """Paster command that copies imageboard threads into the local database.

    Modes (``--mode``):
      * ``update`` (default) - re-check every thread on the archive list;
      * ``add``    - put a thread on the archive list;
      * ``thread`` - archive one thread (``--thread``) or every thread number
        listed in a file (``--list``) right away.
    """
    # Parser configuration
    summary = "--NO SUMMARY--"
    usage = "--NO USAGE--"
    group_name = "fc"
    parser = Command.standard_parser(verbose=False)
    parser.add_option("--mode")
    parser.add_option("--chan")
    parser.add_option("--board")
    parser.add_option("--thread")
    parser.add_option("--chanTag")
    parser.add_option("--type")
    parser.add_option("--tags")
    parser.add_option("--timeDiff")
    parser.add_option("--directlink")
    parser.add_option("--list")
    parser.add_option("--filters")
    parser.add_option("--forcetype")
    # Board type name -> parser used for threads of that type.
    parsers = {'wakaba': WakabaParser()}

    def command(self):
        """Bootstrap the application from development.ini and run the requested mode.

        Raises ImportError when no base module of the application can be
        imported.
        """
        self.verbose = 3
        config_file = 'development.ini'
        config_name = 'config:%s' % config_file
        here_dir = os.getcwd()
        conf = appconfig(config_name, relative_to=here_dir)
        conf.update(dict(app_conf=conf.local_conf, global_conf=conf.global_conf))
        paste.deploy.config.CONFIG.push_thread_config(conf)
        sys.path.insert(0, here_dir)
        wsgiapp = loadapp(config_name, relative_to=here_dir)
        test_app = paste.fixture.TestApp(wsgiapp)
        tresponse = test_app.get('/_test_vars')
        request_id = int(tresponse.body)
        test_app.pre_request_hook = lambda self: paste.registry.restorer.restoration_end()
        test_app.post_request_hook = lambda self: paste.registry.restorer.restoration_begin(request_id)
        paste.registry.restorer.restoration_begin(request_id)
        egg_info = find_egg_info_dir(here_dir)
        f = open(os.path.join(egg_info, 'top_level.txt'))
        try:
            packages = [l.strip() for l in f.readlines() if l.strip() and not l.strip().startswith('#')]
        finally:
            f.close()
        found_base = False
        for pkg_name in packages:
            # Import the base module of the application
            base_module = pkg_name + '.lib.base'
            found_base = can_import(base_module)
            if not found_base:
                # Minimal template
                base_module = pkg_name + '.controllers'
                found_base = can_import(base_module)

            if found_base:
                break

        if not found_base:
            raise ImportError("Could not import base module. Are you sure this is a Pylons app?")

        self.thread = self.options.thread
        self.chan = self.options.chan
        self.chanTag = self.options.chanTag
        self.board = self.options.board

        logging.getLogger('sqlalchemy').setLevel(logging.ERROR)
        GFilters['sage'] = IBFilterSage()
        GFilters['lowres'] = IBFilterLowres()
        if not self.options.mode or self.options.mode == 'update':
            self.UpdateArchive()
        elif self.options.mode == 'add':
            self.AddToArchive()
        elif self.options.mode == 'thread':
            if self.options.list:
                listFile = open(self.options.list, 'r')
                try:
                    # Blank lines (e.g. a trailing newline) are not thread numbers.
                    tList = [line for line in listFile.readlines() if line.strip()]
                finally:
                    listFile.close()
            else:
                tList = [self.options.thread]
            for t in tList:
                entry = ArchiveList()
                entry.tid = int(t)
                entry.url = self.options.chan
                entry.chanTag = self.options.chanTag
                entry.board = self.options.board or 'b'
                entry.tags = self.options.tags or ''
                entry.type = self.options.type or 'wakaba'
                entry.filters = self.options.filters or ''
                entry.timeDiff = self.options.timeDiff or 0
                entry.lastChanged = datetime.datetime.fromtimestamp(0)
                print("Processing %s %s %s %s" % (entry.tid, entry.url, entry.chanTag, entry.board))
                thread = Thread(entry, self.parsers, self.options.directlink, self.options.forcetype)
                self.processThread(thread)

    def LoadPage(self, thread, chan='2ch.ru', board='b'):
        """Download a thread page (or the board index when ``thread`` is empty) and return its body.

        Stores ``host``, ``path`` and ``url`` on the instance and prints the url.
        """
        self.host = 'http://' + chan
        if thread:
            self.path = '/' + board + '/res/'
            self.url = self.host + self.path + thread + '.html'
        else:
            self.path = '/' + board + '/'
            self.url = self.host + self.path
        print(self.url)
        req = urllib2.Request(self.url)
        req.add_header('Referer', self.host + '/' + board + '/')
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            response.close()

    def getTags(self, tagsList):
        """Return a ``Tag`` object for every name in ``tagsList``, creating the missing ones."""
        tags = []
        for tagName in tagsList:
            tag = meta.Session.query(Tag).filter(Tag.tag == tagName).first()
            if not tag:
                tag = Tag(tagName)
            tags.append(tag)
        return tags

    def processPost(self, post):
        """Parse one post, fetch its picture and save it unless a filter rejects it."""
        post.thread.parser.parsePost(post)
        post.pic = False
        if post.src:
            post.pic = self.LoadImage(post)
            if post.pic == -1:
                # -1 marks a picture that could not be processed.
                post.pic = None
            if post.pic:
                post.picid = post.pic.id
        print("Thread %s Post %s (Image:%s %s %sx%s) at %s, sage : %s" % (post.Ids[0], post.Ids[1], post.src, post.pic and post.pic.id or 0, post.pic and post.pic.width or 0, post.pic and post.pic.height or 0, post.date, post.sage))
        if post.thread.filter(post):
            print("Filtered out")
            print("----------------------")
            return
        if post.Ids[0] == post.Ids[1]:
            # Thread-opening post.
            post.parentid = -1
            post.replyCount = 0
            post.bumpDate = post.date
            post.tags = self.getTags([post.thread.chanTag, post.thread.board] + post.thread.tags)
            post.thread.post = post
        else:
            post.parentid = post.thread.post.id
            if not post.sage:
                post.thread.post.bumpDate = post.date
            # NOTE(review): the indentation of the source was lost; every
            # reply is assumed to be counted, sage or not - confirm.
            post.thread.post.replyCount += 1
        post.uidNumber = 1
        meta.Session.save(post)
        meta.Session.commit()
        idList[post.Ids[1]] = [post.id, post.Ids[0]]
        print("Saved in DB as %s/%s" % (post.id, post.parentid))
        print("----------------------")

    def processThread(self, thread):
        """Archive every post of ``thread`` that is newer than what is already stored."""
        if not thread.initialize():
            return
        if thread.threadId:
            thread.post = meta.Session.query(Post).get(thread.threadId)
            lastPost = meta.Session.query(Post).filter(Post.parentid == thread.post.id).filter(Post.secondaryIndex > 0).order_by(Post.secondaryIndex.desc()).first()
            if lastPost:
                lastId = lastPost.secondaryIndex
            else:
                lastId = int(thread.tid)
        else:
            lastId = 0
        skipped = 0
        for post in thread.posts:
            if int(post.Ids[1]) > lastId:
                if skipped:
                    print("Skipped %s out of %s posts" % (skipped, len(thread.posts)))
                    skipped = 0
                self.processPost(post)
            else:
                skipped += 1
        if skipped:
            print("Skipped %s out of %s posts" % (skipped, len(thread.posts)))

    def LoadImage(self, post):
        """Fetch the file attached to ``post`` and register it.

        Returns the first item of the ``processFile`` result (a picture
        record or -1), False when ``processFile`` rejected the file, or None
        when the download produced nothing.
        """
        url = post.thread.loader.getAbsolutePath(post.src)
        fileName = post.thread.loader.parseLink(url)[4]
        res = post.thread.loader.getFromRelative(post.src)
        if not res:
            return None
        localFilePath = os.path.join(g.OPT.uploadPath, fileName)
        localFile = open(localFilePath, 'wb')
        try:
            localFile.write(res)
        finally:
            localFile.close()
        file = FieldStorageLike(fileName, localFilePath)
        fileDescriptors = self.processFile(file, 200)
        pic = False
        if fileDescriptors:
            pic = fileDescriptors[0]
            fileHolder = fileDescriptors[1]
            if pic and pic != -1 and fileHolder:
                fileHolder.disableDeletion()
        return pic

    def processFile(self, file, thumbSize=250):
        """Store an uploaded file under a time-based name and create its ``Picture`` record.

        Returns:
          * '' when the file name has no extension;
          * False when ``file`` is not an upload object or the extension is
            not registered;
          * [existing picture, False] when a file with the same MD5 is known;
          * [-1, AngryFileHolder] when the thumbnail could not be made;
          * [new picture, AngryFileHolder] otherwise.
        """
        if not (isinstance(file, cgi.FieldStorage) or isinstance(file, FieldStorageLike)):
            return False

        name = str(long(time.time() * 10**7))
        nameParts = file.filename.rsplit('.', 1)
        if len(nameParts) < 2:
            # Panic, no extension found
            return ''
        ext = nameParts[1].lstrip(os.sep)

        # Make sure it is something we want to have
        extParams = meta.Session.query(Extension).filter(Extension.ext == ext).first()
        if not extParams:
            return False

        localFilePath = os.path.join(g.OPT.uploadPath, name + '.' + ext)
        localFile = open(localFilePath, 'w+b')
        try:
            shutil.copyfileobj(file.file, localFile)
            localFile.seek(0)
            md5 = hashlib.md5(localFile.read()).hexdigest()
        finally:
            file.file.close()
            localFile.close()

        # Check whether this file is already stored; if so reuse its record.
        pic = meta.Session.query(Picture).filter(Picture.md5 == md5).first()
        if pic:
            os.unlink(localFilePath)
            return [pic, False]

        try:
            if extParams.type == 'image':
                thumbFilePath = name + 's.' + ext
                size = self.makeThumbnail(localFilePath, os.path.join(g.OPT.uploadPath, thumbFilePath), (thumbSize, thumbSize))
            elif extParams.type == 'image-jpg':
                thumbFilePath = name + 's.jpg'
                size = self.makeThumbnail(localFilePath, os.path.join(g.OPT.uploadPath, thumbFilePath), (thumbSize, thumbSize))
            else:
                # Not an image: use the thumbnail registered for the extension.
                thumbFilePath = extParams.path
                size = [0, 0, extParams.thwidth, extParams.thheight]
        except Exception:
            # Any thumbnail failure rejects the file; KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return [-1, AngryFileHolder(localFilePath)]

        pic = Picture()
        pic.path = name + '.' + ext
        pic.thumpath = thumbFilePath
        pic.width = size[0]
        pic.height = size[1]
        pic.thwidth = size[2]
        pic.thheight = size[3]
        pic.extid = extParams.id
        pic.size = os.stat(localFilePath)[6]
        pic.md5 = md5
        meta.Session.save(pic)
        meta.Session.commit()
        return [pic, AngryFileHolder(localFilePath, pic)]

    def makeThumbnail(self, source, dest, maxSize):
        """Write a thumbnail of ``source`` to ``dest`` that fits into ``maxSize``.

        Returns (width, height, thumbnail width, thumbnail height), or []
        when the opened image is falsy.
        """
        sourceImage = Image.open(source)
        size = sourceImage.size
        if not sourceImage:
            return []
        sourceImage.thumbnail(maxSize, Image.ANTIALIAS)
        sourceImage.save(dest)
        return size + sourceImage.size

    def AddToArchive(self):
        """Put the thread named by the command-line options on the archive list.

        Requires ``--thread``, ``--chan`` and ``--chanTag``; the board
        defaults to 'b'. Prints a message when parameters are missing or the
        thread is already listed.
        """
        options = self.options
        if not (options.thread and options.chan and options.chanTag):
            print("Bad parameters")
            return
        if not options.board:
            options.board = 'b'
        entry = meta.Session.query(ArchiveList).filter(ArchiveList.tid == options.thread).filter(ArchiveList.url == options.chan).filter(ArchiveList.board == options.board).first()
        if entry:
            print("Thread is already in the list")
            return
        entry = ArchiveList()
        entry.tid = options.thread
        entry.url = options.chan
        entry.chanTag = options.chanTag
        entry.board = options.board
        entry.tags = options.tags or ''
        entry.type = options.type or 'wakaba'
        entry.filters = options.filters or ''
        entry.timeDiff = options.timeDiff or 0
        entry.lastChanged = datetime.datetime.fromtimestamp(0)
        meta.Session.save(entry)
        meta.Session.commit()

    def UpdateArchive(self):
        """Re-check every listed thread: drop missing ones, archive changed ones."""
        archiveList = meta.Session.query(ArchiveList).all()
        for entry in archiveList:
            thread = Thread(entry, self.parsers)
            state = thread.checkState()
            print("*** Thread %s HTTP %s" % (thread.directlink, state[0]))
            if state[0] == 404:
                # The source thread is gone; stop tracking it.
                meta.Session.delete(entry)
                meta.Session.commit()
            elif state[0] == 200:
                self.processThread(thread)
                entry.lastChanged = state[1]
                meta.Session.commit()