Test
[chans.py.git] / updateArchive.py
blob6322238d5133e81b4d566f075c4f6385529e402c
1 # coding=utf-8
2 #fgsfds
3 from paste.script.command import Command
4 from fc.lib.base import *
5 from fc.model import *
6 from sqlalchemy.orm import eagerload
7 from sqlalchemy.orm import class_mapper
8 from sqlalchemy.sql import and_, or_, not_
9 import sqlalchemy
10 import os
11 import cgi
12 import shutil
13 import datetime
14 import time
15 import Image
16 import hashlib
17 import re
19 from fc.lib.fuser import FUser
20 from fc.lib.miscUtils import *
21 from fc.lib.constantValues import *
22 from fc.lib.settings import *
23 from fc.lib.fileHolder import AngryFileHolder
24 import sys
25 import paste.fixture
26 import paste.registry
27 import paste.deploy.config
28 from paste.deploy import loadapp, appconfig
29 from paste.script.command import Command, BadCommand
30 from paste.script.filemaker import FileOp
31 from paste.script.pluginlib import find_egg_info_dir
32 import urllib2
33 import httplib
34 from lxml import etree
35 import StringIO
36 from fc.model.arch import *
37 import logging
def can_import(name):
    """Report whether the package/module called *name* can be imported.

    The import is really performed (via ``__import__``), so on success the
    module ends up in ``sys.modules``.  Returns True on success and False
    when an ImportError is raised.
    """
    try:
        __import__(name)
    except ImportError:
        return False
    else:
        return True
def unicodify(text):
    """Decode *text* from UTF-8 when it is a ``str``; pass anything else through.

    Under Python 2 (which this file targets) ``str`` is the byte-string type,
    so the result is a ``unicode`` object.  Non-``str`` values, including
    None, are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return text.decode('utf-8')
# Cache of posts saved during this run: remote post number ->
# [local post id, remote thread number].  Filled by
# UpdateArchive.processPost and consulted by IBParser.ResolveSecondaryId.
idList = dict()
# Registry of post filters by name; UpdateArchive.command registers the
# entries and Thread.__init__ looks them up.
GFilters = dict()
class DateTimeParser:
    """Parses the date strings found in board pages and HTTP headers."""

    # One tuple of accepted spellings per month, January first: Russian
    # abbreviation, English abbreviation, Russian genitive form.
    monthes = [
        ('Янв', 'Jan', 'января'),
        ('Фев', 'Feb', 'февраля'),
        ('Мар', 'Mar', 'марта'),
        ('Апр', 'Apr', 'апреля'),
        ('Май', 'May', 'мая'),
        ('Июн', 'Jun', 'июня'),
        ('Июл', 'Jul', 'июля'),
        ('Авг', 'Aug', 'августа'),
        ('Сен', 'Sep', 'сентября'),
        ('Окт', 'Oct', 'октября'),
        ('Ноя', 'Nov', 'ноября'),
        ('Дек', 'Dec', 'декабря'),
    ]
    # "<non-digits> DAY MONTHNAME YEAR HH:MM:SS", e.g. "Wed, 21 Oct 2015 07:28:00".
    dateRe = re.compile(r"""[^\d]+(\d+)\s+([^\d\s]+)\s+(\d+)\s+(\d+)\:(\d+)\:(\d+)""")
    # "YYYY-MM-DD HH:MM:SS"
    dateReISO = re.compile(r"""(\d+)\-(\d+)\-(\d+) (\d+)\:(\d+)\:(\d+)""")

    def getDateTime(self, date):
        """Parse a "day month-name year h:m:s" string into a naive datetime.

        Returns None when *date* is empty/None, does not match ``dateRe``,
        or names a month that is not listed in ``monthes``.  (Previously the
        first two cases raised TypeError/IndexError while the third already
        returned None.)
        """
        if not date:
            return None
        found = self.dateRe.search(date)
        if found is None:
            return None
        day, monthName, year, hour, minute, second = found.groups()
        for index, spellings in enumerate(self.monthes):
            if monthName in spellings:
                return datetime.datetime(int(year), index + 1, int(day),
                                         int(hour), int(minute), int(second))
        return None

    def getDateTimeFromISO8601(self, date):
        """Parse "YYYY-MM-DD HH:MM:SS" into a naive datetime.

        Returns None when *date* is empty/None or does not match
        ``dateReISO`` (previously this raised).
        """
        if not date:
            return None
        found = self.dateReISO.search(date)
        if found is None:
            return None
        return datetime.datetime(*[int(part) for part in found.groups()])


# Shared parser instance used by the loaders and post parsers in this module.
DTP = DateTimeParser()
class IBParser:
    """Base class for board-specific HTML parsers.

    Subclasses are expected to supply the ``replyIdRe`` and ``postIdRe``
    compiled patterns used by ``GetPostID``.
    """

    def _findSibling(self, el, tag, skip, step):
        """Return the first sibling of *el* named *tag*, walking via *step*.

        *step* is the name of the element method used to advance
        ('getnext' or 'getprevious').  The search starts at *el* itself
        unless *skip* is true.  Tag names are compared case-insensitively.
        Returns None when nothing matches, including when there is no
        sibling to start from.
        """
        wanted = tag.lower()
        node = el
        if skip:
            node = getattr(el, step)()
        while node is not None:
            # Nodes whose .tag is not a string (lxml reports a function for
            # comments and processing instructions) can never match.
            lower = getattr(node.tag, 'lower', None)
            if lower is not None and lower() == wanted:
                return node
            node = getattr(node, step)()
        return None

    def GetNextTag(self, el, tag, skip=0):
        """Return *el* or its first following sibling named *tag*, else None.

        With a true *skip*, *el* itself is not considered.
        """
        return self._findSibling(el, tag, skip, 'getnext')

    def GetPreviousTag(self, el, tag, skip=0):
        """Return *el* or its first preceding sibling named *tag*, else None.

        With a true *skip*, *el* itself is not considered.
        """
        return self._findSibling(el, tag, skip, 'getprevious')

    def ResolveSecondaryId(self, thread, Ids):
        """Translate remote ids into the local id of the stored post.

        *Ids* is ``[remote thread number, remote post number]``.  The
        module-level ``idList`` cache is consulted first; otherwise the
        opening post is looked up by its ``secondaryIndex`` among posts
        tagged with both ``thread.chanTag`` and ``thread.board``, and then
        the reply by its ``secondaryIndex`` under that opening post.
        Returns None when the thread or the post is not stored.
        """
        postNumber = int(Ids[1])
        if postNumber in idList:
            return idList[postNumber][0]

        onBoard = and_(Post.tags.any(tag=thread.chanTag),
                       Post.tags.any(tag=thread.board))
        isOpeningPost = and_(Post.parentid == -1, onBoard)
        wanted = and_(Post.secondaryIndex == Ids[0], isOpeningPost)
        openingPost = meta.Session.query(Post).filter(wanted).first()
        if not openingPost:
            return None
        if Ids[0] == Ids[1]:
            return openingPost.id

        reply = meta.Session.query(Post).filter(
            and_(Post.secondaryIndex == postNumber,
                 Post.parentid == openingPost.id)).first()
        if reply:
            return reply.id
        return None

    def GetPostID(self, post):
        """Return ``[thread number, post number]`` extracted from ``post.href``.

        When ``post.thread`` is set, the thread number is ``post.thread.tid``
        and the post number comes from ``replyIdRe``.  Otherwise both come
        from ``postIdRe``; a link without a post anchor refers to the
        opening post, so the thread number is used for both.
        """
        if post.thread:
            ids = self.replyIdRe.findall(post.href)
            return [post.thread.tid, int(ids[0])]
        ids = self.postIdRe.findall(post.href)
        threadNumber = int(ids[0][0])
        anchor = ids[0][2]
        if anchor:
            return [threadNumber, int(anchor)]
        return [threadNumber, threadNumber]
class Loader:
    """Common base of the page loaders; only knows how to take links apart."""

    def parseLink(self, link):
        """Split *link* into the six pieces the loaders work with.

        Returns a list:
          [0] scheme, or None when the link has no (non-empty) "scheme://"
          [1] first path segment (the host for URLs)
          [2] scheme prefix + first segment + '/'
          [3] scheme prefix + everything up to the last segment + '/'
          [4] last segment (the file name)
          [5] '/' + every segment after the first
        """
        bySchemeSep = link.split('://')
        scheme = None
        if len(bySchemeSep) > 1 and bySchemeSep[0]:
            scheme = bySchemeSep[0]
        prefix = ''
        if scheme:
            prefix = scheme + '://'
        segments = bySchemeSep[-1].split('/')
        head = segments[0]
        tail = segments[-1]
        root = prefix + head + '/'
        folder = prefix + '/'.join(segments[:-1]) + '/'
        pathOnly = '/' + '/'.join(segments[1:])
        return [scheme, head, root, folder, tail, pathOnly]
class LoaderLocal(Loader):
    """Loader for threads stored on the local file system."""

    def __init__(self, link):
        """Remember the directory part of *link* as the base for relative paths."""
        p = self.parseLink(link)
        self.relativeUrl = p[3]

    def stat(self, link):
        """Return ``[modification time, size]`` of the file, or None on OSError."""
        try:
            stats = os.stat(link)
        except OSError:
            return None
        # Tuple indices of os.stat(): 8 is st_mtime, 6 is st_size.
        return [datetime.datetime.fromtimestamp(stats[8]), stats[6]]

    def get(self, url):
        """Return the raw contents of the file at *url*."""
        source = open(url, 'rb')
        # Close the handle explicitly instead of leaving it to the
        # garbage collector.
        try:
            return source.read()
        finally:
            source.close()

    def getAbsolutePath(self, url):
        """Prefix *url* with the directory of the thread file."""
        return self.relativeUrl + url

    def getFromRelative(self, url):
        """Read a file addressed relative to the thread file."""
        return self.get(self.getAbsolutePath(url))
class LoaderHTTP(Loader):
    """Loader for threads fetched over HTTP."""

    def __init__(self, link):
        """Split *link* once and keep the pieces needed to build other URLs."""
        p = self.parseLink(link)
        self.proto = p[0]
        self.host = p[1]
        self.baseUrl = p[2]       # "scheme://host/"
        self.relativeUrl = p[3]   # directory of the thread page

    def stat(self, link):
        """Issue a HEAD request for *link*.

        Returns ``[date, size]`` on HTTP 200, where *date* is parsed from
        the Last-Modified header (falling back to Date) and is None when
        neither header is present, and *size* is the raw Content-Length
        header value (0 when absent).  Any other status yields None.
        """
        linkp = self.parseLink(link)
        c = httplib.HTTPConnection(linkp[1])
        # NOTE(review): a plain HTTPConnection is used whatever the scheme
        # of the link is.
        try:
            c.request('HEAD', linkp[5])
            r = c.getresponse()
            if r.status != 200:
                return None
            size = r.getheader('content-length', 0)
            date = r.getheader('last-modified', r.getheader('date', None))
        finally:
            c.close()
        changed = None
        if date:
            changed = DTP.getDateTime(date)
        return [changed, size]

    def get(self, url):
        """Download *url*, sending the site root as Referer.

        Returns the response body, or None when the server answers with an
        HTTP error status.
        """
        req = urllib2.Request(url)
        req.add_header('Referer', self.baseUrl)
        try:
            f = urllib2.urlopen(req)
        except urllib2.HTTPError:
            return None
        try:
            return f.read()
        finally:
            f.close()

    def getAbsolutePath(self, url):
        """Turn a link found in the page into an absolute URL.

        URLs that already carry a scheme are returned unchanged,
        root-relative ones ("/...") are attached to the site root without
        doubling the slash, and everything else is taken relative to the
        thread page's directory.
        """
        if self.parseLink(url)[0]:
            return url
        if url.startswith('/'):
            return self.baseUrl.rstrip('/') + url
        return self.relativeUrl + url

    def getFromRelative(self, url):
        """Download a resource addressed relative to the thread page."""
        return self.get(self.getAbsolutePath(url))
class IBFilter:
    """Base post filter: a true result from ``filter`` rejects the post."""

    def filter(self, post):
        """Never reject anything; subclasses override this."""
        verdict = None
        return verdict
class IBFilterSage(IBFilter):
    """Rejects posts whose ``sage`` flag is set."""

    def filter(self, post):
        """Return the post's ``sage`` attribute as the verdict."""
        isSage = post.sage
        return isSage
class IBFilterLowres(IBFilter):
    """Rejects posts whose picture is narrower than 50 pixels."""

    def filter(self, post):
        """Return a true value only for posts with a picture under 50px wide.

        Posts without a picture yield the (false) ``post.pic`` value itself.
        """
        picture = post.pic
        if not picture:
            return picture
        return picture.width < 50
class Thread:
    """A remote thread being archived: its location, parser, loader and filters."""

    def __init__(self, entry, parsers, directlink=None, forcetype=None):
        """Build the thread description from an archive-list *entry*.

        *parsers* maps entry types to parser objects.  *directlink*, when
        given, replaces the URL the parser would build; a link with a
        scheme is loaded over HTTP, anything else from the local disk.
        ``entry.tags`` and ``entry.filters`` are comma-separated strings;
        filter names are looked up in ``GFilters``.
        """
        self.parser = parsers[entry.type]
        self.tid = entry.tid
        self.url = entry.url
        self.board = entry.board
        self.chanTag = entry.chanTag
        if entry.tags:
            self.tags = entry.tags.split(',')
        else:
            self.tags = []
        self.type = entry.type
        self.forcetype = forcetype
        self.lastChanged = entry.lastChanged

        filterNames = []
        if entry.filters:
            filterNames = entry.filters.split(',')
        self.filters = [GFilters[filterName] for filterName in filterNames]

        self.timeDiff = entry.timeDiff
        if directlink:
            self.directlink = directlink
        else:
            self.directlink = self.parser.GetThreadLink(self.url, self.board, self.tid)

        hasScheme = Loader().parseLink(self.directlink)[0]
        if hasScheme:
            loaderClass = LoaderHTTP
        else:
            loaderClass = LoaderLocal
        self.loader = loaderClass(self.directlink)

    def checkState(self):
        """Compare the remote state with ``lastChanged``.

        Returns ``[404]`` when the loader cannot stat the thread, otherwise
        ``[200, date, size]`` when it changed after ``lastChanged`` and
        ``[304, date, size]`` when it did not.
        """
        info = self.loader.stat(self.directlink)
        if not info:
            return [404]
        changed, size = info[0], info[1]
        code = 304
        if changed > self.lastChanged:
            code = 200
        return [code, changed, size]

    def initialize(self):
        """Download and parse the thread page.

        Sets ``document``, ``posts`` and ``threadId``.  Returns True when
        the page was loaded and contains at least one post.
        """
        page = self.loader.get(self.directlink)
        if not page:
            return False
        htmlParser = etree.HTMLParser()
        if isinstance(page, str):
            page = page.decode('utf-8')
        self.document = etree.parse(StringIO.StringIO(page), htmlParser)
        self.posts = self.parser.GetPostsList(self)
        self.threadId = self.parser.ResolveSecondaryId(self, [self.tid, self.tid])
        return bool(self.posts)

    def filter(self, post):
        """Run the configured filters; a true result means "drop this post".

        Filters are consulted in order until one returns a true value.
        """
        verdict = None
        for postFilter in self.filters or []:
            if not verdict:
                verdict = postFilter.filter(post)
        return verdict

    def ReplaceReference(self, m):
        """``re.sub`` callback rewriting a ">>N" link to point at the local copy.

        Groups 2 and 3 of the match are the referenced thread and post
        numbers.  When either cannot be resolved, an error is printed and
        a link through ``/secondaryIndex/`` is produced instead.
        """
        groups = m.groups()
        threadRef = groups[1]
        postRef = groups[2]
        tid = self.parser.ResolveSecondaryId(self, [threadRef, threadRef])
        if tid:
            pid = tid
            if threadRef != postRef:
                pid = self.parser.ResolveSecondaryId(self, [threadRef, postRef])
            if pid:
                return '<a href="/%s#i%s" onclick="highlight(%s)">&gt;&gt;%s</a>' % (tid, pid, pid, postRef)
        print("ERROR! %s/%s does not exist!" % (threadRef, postRef))
        return '<a href="/secondaryIndex/%s#i%s" onclick="highlight(%s)">&gt;&gt;%s</a>' % (threadRef, postRef, postRef, postRef)
class WakabaParser(IBParser):
    """Parser for thread pages with Wakaba-style markup."""

    # Post number inside a reflink href.
    replyIdRe = re.compile(r""">>(\d+)""")
    # Thread number (and optional post anchor) inside a page URL.
    postIdRe = re.compile(r"""\/(\d+)\.x?h?t?ml?(#i?(\d+))?""")
    # A rendered ">>N" reference link: optional path, thread number, post number.
    referenceRe = re.compile(r"""<a [^>]*href="([^"]*/)?(\d+)\.[^"]+"[^>]*>\&gt\;\&gt\;(\d+)</a>""")

    def GetThreadLink(self, url, board, thread):
        """Build the URL of a thread page from host, board and thread number."""
        return 'http://%s/%s/res/%s.html' % (url, board, str(thread))

    def GetPostsList(self, thread):
        """Create one ``Post`` per reflink anchor found in the thread's form.

        Each post gets ``thread``, ``href``, ``reflink``, ``Ids`` and
        ``secondaryIndex`` filled in.  Returns None when no anchor is found.
        """
        anchors = thread.document.xpath("/html/body/form//*[@class='reflink']/a")
        if not anchors:
            return None
        collected = []
        for anchor in anchors:
            post = Post()
            post.thread = thread
            post.href = anchor.get('href')
            post.reflink = anchor.getparent()
            post.Ids = self.GetPostID(post)
            post.secondaryIndex = int(post.Ids[1])
            collected.append(post)
        return collected

    def GetImgSrc(self, post):
        """Return the href of the first "/src/" link next to the post label.

        When the thread has ``forcetype`` set, only the file name is kept
        and re-rooted under "../src/".  Returns None when there is no such
        link.
        """
        for sibling in post.l.getparent():
            if sibling.tag.lower() != 'a':
                continue
            href = sibling.get('href')
            if not href or '/src/' not in href:
                continue
            if post.thread.forcetype:
                return '../src/' + post.thread.loader.parseLink(href)[4]
            return href
        return None

    def ParseText(self, post):
        """Return the inner HTML of the post's blockquote with references rewritten.

        Returns an empty unicode string when the post has no blockquote and
        None (after printing a message) when the serialized element is not
        wrapped in plain <blockquote> tags.
        """
        if post.bq is None:
            return u''
        post.bq.tail = ''
        markup = etree.tostring(post.bq, pretty_print=False, encoding='utf-8')
        opened = markup[:12].lower() == '<blockquote>'
        closed = markup[-13:].lower() == '</blockquote>'
        if not (opened and closed):
            print("Cant parse this message : '%s'" % markup)
            return None
        inner = markup[12:-13]
        return self.referenceRe.sub(post.thread.ReplaceReference, inner)

    def parsePost(self, post):
        """Fill title, sage flag, image source, date and message of *post*.

        Everything is read from the elements around ``post.reflink``: the
        preceding <label> and the following <blockquote>.
        """
        post.bq = self.GetNextTag(post.reflink, 'blockquote')
        label = self.GetPreviousTag(post.reflink, 'label')
        post.l = label
        post.title = unicodify(label[1].text) or u''
        post.cpn = label[2]
        post.sage = False
        if len(post.cpn) > 0 and post.cpn[0].tag.lower() == 'a':
            post.cpnHref = post.cpn[0].get('href')
            if post.cpnHref.find('sage') != -1:
                post.sage = True
        post.src = self.GetImgSrc(post)
        # The date is the text that follows the third child of the label.
        rawDate = label[2].tail.encode('utf-8')
        for lineBreak in ("\r", "\n"):
            rawDate = rawDate.replace(lineBreak, '')
        post.date = DTP.getDateTime(rawDate)
        post.message = unicodify(self.ParseText(post))
345 class UpdateArchive(Command):
346 # Parser configuration
347 summary = "--NO SUMMARY--"
348 usage = "--NO USAGE--"
349 group_name = "fc"
350 parser = Command.standard_parser(verbose=False)
351 parser.add_option("--mode")
352 parser.add_option("--chan")
353 parser.add_option("--board")
354 parser.add_option("--thread")
355 parser.add_option("--chanTag")
356 parser.add_option("--type")
357 parser.add_option("--tags")
358 parser.add_option("--timeDiff")
359 parser.add_option("--directlink")
360 parser.add_option("--list")
361 parser.add_option("--filters")
362 parser.add_option("--forcetype")
363 parsers = {'wakaba':WakabaParser()}
364 def command(self):
365 """Main command to create a new shell"""
366 self.verbose = 3
367 config_file = 'development.ini'
368 config_name = 'config:%s' % config_file
369 here_dir = os.getcwd()
370 locs = dict(__name__="pylons-admin")
371 conf = appconfig(config_name, relative_to=here_dir)
372 conf.update(dict(app_conf=conf.local_conf,global_conf=conf.global_conf))
373 paste.deploy.config.CONFIG.push_thread_config(conf)
374 sys.path.insert(0, here_dir)
375 wsgiapp = loadapp(config_name, relative_to=here_dir)
376 test_app = paste.fixture.TestApp(wsgiapp)
377 tresponse = test_app.get('/_test_vars')
378 request_id = int(tresponse.body)
379 test_app.pre_request_hook = lambda self:paste.registry.restorer.restoration_end()
380 test_app.post_request_hook = lambda self:paste.registry.restorer.restoration_begin(request_id)
381 paste.registry.restorer.restoration_begin(request_id)
382 egg_info = find_egg_info_dir(here_dir)
383 f = open(os.path.join(egg_info, 'top_level.txt'))
384 packages = [l.strip() for l in f.readlines() if l.strip() and not l.strip().startswith('#')]
385 f.close()
386 found_base = False
387 for pkg_name in packages:
388 # Import all objects from the base module
389 base_module = pkg_name + '.lib.base'
390 found_base = can_import(base_module)
391 if not found_base:
392 # Minimal template
393 base_module = pkg_name + '.controllers'
394 found_base = can_import(base_module)
396 if found_base:
397 break
399 if not found_base:
400 raise ImportError("Could not import base module. Are you sure this is a Pylons app?")
402 base = sys.modules[base_module]
403 base_public = [__name for __name in dir(base) if not \
404 __name.startswith('_') or __name == '_']
405 for name in base_public:
406 locs[name] = getattr(base, name)
407 locs.update(dict(wsgiapp=wsgiapp, app=test_app))
409 mapper = tresponse.config.get('routes.map')
410 if mapper:
411 locs['mapper'] = mapper
414 self.thread = self.options.thread
415 self.chan = self.options.chan
416 self.chanTag = self.options.chanTag
417 self.board = self.options.board
419 logging.getLogger('sqlalchemy').setLevel(logging.ERROR)
420 GFilters['sage'] = IBFilterSage()
421 GFilters['lowres'] = IBFilterLowres()
422 #logging.getLogger( 'sqlalchemy').setLevel( logging.NONE )
423 if not self.options.mode or self.options.mode == 'update':
424 self.UpdateArchive()
425 elif self.options.mode == 'add':
426 self.AddToArchive()
427 elif self.options.mode == 'thread':
428 if self.options.list:
429 f = open(self.options.list,'r')
430 tList = f.readlines()
431 else:
432 tList = [self.options.thread]
433 for t in tList:
434 entry = ArchiveList()
435 entry.tid = int(t)
436 entry.url = self.options.chan
437 entry.chanTag = self.options.chanTag
438 entry.board = self.options.board or 'b'
439 entry.tags = self.options.tags or ''
440 entry.type = self.options.type or 'wakaba'
441 entry.filters = self.options.filters or ''
442 entry.timeDiff = self.options.timeDiff or 0
443 entry.lastChanged = datetime.datetime.fromtimestamp(0)
444 print "Processing %s %s %s %s" % (entry.tid,entry.url,entry.chanTag,entry.board)
445 thread = Thread(entry,self.parsers,self.options.directlink,self.options.forcetype)
446 self.processThread(thread)
448 def LoadPage(self,thread,chan='2ch.ru',board='b'):
449 self.host = 'http://'+chan
450 if thread:
451 self.path = '/'+board+'/res/'
452 self.url = self.host+self.path+thread+'.html'
453 else:
454 self.path = '/'+board+'/'
455 self.url = self.host+self.path
456 print self.url
457 req = urllib2.Request(self.url)
458 req.add_header('Referer', self.host+'/'+board+'/')
459 f = urllib2.urlopen(req)
460 res = f.read()
461 return res
463 def getTags(self,tagsList):
464 tags = []
465 for tagName in tagsList:
466 tag = meta.Session.query(Tag).filter(Tag.tag==tagName).first()
467 if tag:
468 tags.append(tag)
469 else:
470 tags.append(Tag(tagName))
471 return tags
473 def processPost(self,post):
474 post.thread.parser.parsePost(post)
475 post.pic = False
476 if post.src:
477 post.pic = self.LoadImage(post)
478 if post.pic == -1:
479 post.pic = None
480 if post.pic:
481 post.picid = post.pic.id
482 print "Thread %s Post %s (Image:%s %s %sx%s) at %s, sage : %s" % (post.Ids[0],post.Ids[1],post.src,post.pic and post.pic.id or 0,post.pic and post.pic.width or 0,post.pic and post.pic.height or 0,post.date,post.sage)
483 if (post.thread.filter(post)):
484 print "Filtered out"
485 print "----------------------"
486 else:
487 if post.Ids[0] == post.Ids[1]:
488 post.parentid = -1
489 post.replyCount = 0
490 post.bumpDate = post.date
491 post.tags = self.getTags([post.thread.chanTag,post.thread.board]+post.thread.tags)
492 post.thread.post = post
493 else:
494 post.parentid = post.thread.post.id
495 if not post.sage:
496 post.thread.post.bumpDate = post.date
497 post.thread.post.replyCount += 1
498 post.uidNumber = 1
499 meta.Session.save(post)
500 meta.Session.commit()
501 idList[post.Ids[1]]=[post.id,post.Ids[0]]
502 print "Saved in DB as %s/%s" % (post.id,post.parentid)
503 print "----------------------"
505 def processThread(self,thread):
506 if thread.initialize():
507 if thread.threadId:
508 thread.post = meta.Session.query(Post).get(thread.threadId)
509 lastPost = meta.Session.query(Post).filter(Post.parentid==thread.post.id).filter(Post.secondaryIndex>0).order_by(Post.secondaryIndex.desc()).first()
510 if lastPost:
511 lastId = lastPost.secondaryIndex
512 else:
513 lastId = int(thread.tid)
514 else:
515 lastId = 0
516 skipped = 0
517 for post in thread.posts:
518 if int(post.Ids[1]) > lastId:
519 if skipped:
520 print "Skipped %s out of %s posts" % (skipped,len(thread.posts))
521 skipped=0
522 self.processPost(post)
523 else:
524 skipped += 1
525 if skipped:
526 print "Skipped %s out of %s posts" % (skipped,len(thread.posts))
529 def LoadImage(self,post):
530 url = post.thread.loader.getAbsolutePath(post.src)
531 fileName = post.thread.loader.parseLink(url)[4]
532 res = post.thread.loader.getFromRelative(post.src)
533 if res:
534 localFilePath = os.path.join(g.OPT.uploadPath, fileName)
535 localFile = open(localFilePath,'wb')
536 localFile.write(res)
537 localFile.close()
538 file = FieldStorageLike(fileName,localFilePath)
539 fileDescriptors = self.processFile(file, 200)
540 pic = False
541 if fileDescriptors:
542 pic = fileDescriptors[0]
543 fileHolder = fileDescriptors[1]
544 if pic and pic != -1 and fileHolder:
545 fileHolder.disableDeletion()
546 return pic
547 else:
548 return None
550 def processFile(self, file, thumbSize=250):
551 if isinstance(file, cgi.FieldStorage) or isinstance(file,FieldStorageLike):
552 # We should check whether we got this file already or not
553 # If we dont have it, we add it
554 name = str(long(time.time() * 10**7))
555 ext = file.filename.rsplit('.',1)[:0:-1]
557 if ext:
558 ext = ext[0].lstrip(os.sep)
559 else:
560 # Panic, no extention found
561 ext = ''
562 return ''
564 # Make sure its something we want to have
566 extParams = meta.Session.query(Extension).filter(Extension.ext==ext).first()
568 if not extParams:
569 return False
571 localFilePath = os.path.join(g.OPT.uploadPath, name + '.' + ext)
572 localFile = open(localFilePath,'w+b')
573 shutil.copyfileobj(file.file, localFile)
574 localFile.seek(0)
575 md5 = hashlib.md5(localFile.read()).hexdigest()
576 file.file.close()
577 localFile.close()
579 pic = meta.Session.query(Picture).filter(Picture.md5==md5).first()
581 if pic:
582 os.unlink(localFilePath)
583 return [pic, False]
585 try:
586 if extParams.type == 'image':
587 thumbFilePath = name + 's.' + ext
588 size = self.makeThumbnail(localFilePath, os.path.join(g.OPT.uploadPath,thumbFilePath), (thumbSize,thumbSize))
589 else:
590 if extParams.type == 'image-jpg':
591 thumbFilePath = name + 's.jpg'
592 size = self.makeThumbnail(localFilePath, os.path.join(g.OPT.uploadPath,thumbFilePath), (thumbSize,thumbSize))
593 else:
594 thumbFilePath = extParams.path
595 size = [0, 0, extParams.thwidth, extParams.thheight]
596 except:
597 return [-1, AngryFileHolder(localFilePath)]
599 pic = Picture()
600 pic.path = name + '.' + ext
601 pic.thumpath = thumbFilePath
602 pic.width = size[0]
603 pic.height = size[1]
604 pic.thwidth = size[2]
605 pic.thheight = size[3]
606 pic.extid = extParams.id
607 pic.size = os.stat(localFilePath)[6]
608 pic.md5 = md5
609 meta.Session.save(pic)
610 meta.Session.commit()
611 return [pic, AngryFileHolder(localFilePath, pic)]
612 else:
613 return False
615 def makeThumbnail(self, source, dest, maxSize):
616 sourceImage = Image.open(source)
617 size = sourceImage.size
618 if sourceImage:
619 sourceImage.thumbnail(maxSize,Image.ANTIALIAS)
620 sourceImage.save(dest)
621 return size + sourceImage.size
622 else:
623 return []
624 def AddToArchive(self):
625 if self.options.thread and self.options.chan and self.options.chanTag:
626 if not self.options.board:
627 self.options.board = 'b'
628 entry = meta.Session.query(ArchiveList).filter(ArchiveList.tid==self.options.thread).filter(ArchiveList.url==self.options.chan).filter(ArchiveList.board==self.options.board).first()
629 if entry:
630 print "Thread is already in the list"
631 else:
632 entry = ArchiveList()
633 entry.tid = self.options.thread
634 entry.url = self.options.chan
635 entry.chanTag = self.options.chanTag
636 entry.board = self.options.board
637 entry.tags = self.options.tags or ''
638 entry.type = self.options.type or 'wakaba'
639 entry.filters = self.options.filters or ''
640 entry.timeDiff = self.options.timeDiff or 0
641 entry.lastChanged = datetime.datetime.fromtimestamp(0)
642 meta.Session.save(entry)
643 meta.Session.commit()
644 else:
645 print "Bad parameters"
646 def UpdateArchive(self):
647 archiveList = meta.Session.query(ArchiveList).all()
648 for entry in archiveList:
649 thread = Thread(entry,self.parsers)
650 state = thread.checkState()
651 print "*** Thread %s HTTP %s" % (thread.directlink,state[0])
652 if state[0] == 404:
653 meta.Session.delete(entry)
654 meta.Session.commit()
655 elif state[0] == 200:
656 self.processThread(thread)
657 entry.lastChanged = state[1]
658 meta.Session.commit()