From da9f843933cd66c477c4338f21a392f269188ce7 Mon Sep 17 00:00:00 2001
From: Dan Kluev
Date: Tue, 18 Nov 2008 23:17:40 +1000
Subject: [PATCH] Fetching code for wakabas

---
 updateArchive.py | 656 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 656 insertions(+)
 create mode 100644 updateArchive.py

diff --git a/updateArchive.py b/updateArchive.py
new file mode 100644
index 0000000..b3ce414
--- /dev/null
+++ b/updateArchive.py
@@ -0,0 +1,656 @@
+# coding=utf-8
+from paste.script.command import Command
+from fc.lib.base import *
+from fc.model import *
+from sqlalchemy.orm import eagerload
+from sqlalchemy.orm import class_mapper
+from sqlalchemy.sql import and_, or_, not_
+import sqlalchemy
+import os
+import cgi
+import shutil
+import datetime
+import time
+import Image
+import hashlib
+import re
+from fc.lib.fuser import FUser
+from fc.lib.miscUtils import *
+from fc.lib.constantValues import *
+from fc.lib.settings import *
+from fc.lib.fileHolder import AngryFileHolder
+import sys
+import paste.fixture
+import paste.registry
+import paste.deploy.config
+from paste.deploy import loadapp, appconfig
+from paste.script.command import Command, BadCommand
+from paste.script.filemaker import FileOp
+from paste.script.pluginlib import find_egg_info_dir
+import urllib2
+import httplib
+from lxml import etree
+import StringIO
+from fc.model.arch import *
+import logging
+
+def can_import(name):
+    """Attempt to __import__ the specified package/module, returning True when
+    succeeding, otherwise False"""
+    try:
+        __import__(name)
+        return True
+    except ImportError:
+        return False
+
+def unicodify(text):
+    if isinstance(text, str):
+        text = text.decode('utf-8')
+    return text
+
+idList = {}
+GFilters = {}
+
+class DateTimeParser:
+    monthes = [('Янв','Jan','января'),('Фев','Feb','февраля'),('Мар','Mar','марта'),('Апр','Apr','апреля'),('Май','May','мая'),('Июн','Jun','июня'),('Июл','Jul','июля'),('Авг','Aug','августа'),('Сен','Sep','сентября'),('Окт','Oct','октября'),('Ноя','Nov','ноября'),('Дек','Dec','декабря')]
+    dateRe = re.compile(r"""[^\d]+(\d+)\s+([^\d\s]+)\s+(\d+)\s+(\d+)\:(\d+)\:(\d+)""")
+    dateReISO = re.compile(r"""(\d+)\-(\d+)\-(\d+) (\d+)\:(\d+)\:(\d+)""")
+    def getDateTime(self,date):
+        dateP = self.dateRe.findall(date)
+        dateP = dateP[0]
+        mi = 0
+        f = False
+        for mm in self.monthes:
+            mi = mi + 1
+            if dateP[1] in mm:
+                f = True
+                break
+        if f:
+            return datetime.datetime(int(dateP[2]),mi,int(dateP[0]),int(dateP[3]),int(dateP[4]),int(dateP[5]))
+        else:
+            return None
+    def getDateTimeFromISO8601(self,date):
+        dateP = self.dateReISO.findall(date)
+        dateP = dateP[0]
+        return datetime.datetime(int(dateP[0]),int(dateP[1]),int(dateP[2]),int(dateP[3]),int(dateP[4]),int(dateP[5]))
+
+DTP = DateTimeParser()
+
+class IBParser:
+    def GetNextTag(self,el,tag,skip=0):
+        tag = tag.lower()
+        if skip:
+            r = el.getnext()
+        else:
+            r = el
+        if not r.tag or r.tag.lower() != tag:
+            while (r.getnext() != None) and not (r.getnext().tag and r.getnext().tag.lower() == tag):
+                r = r.getnext()
+            if r.getnext() != None:
+                r = r.getnext()
+        if r.tag and r.tag.lower() == tag:
+            return r
+        else:
+            return None
+
+    def GetPreviousTag(self,el,tag,skip=0):
+        tag = tag.lower()
+        if skip:
+            r = el.getprevious()
+        else:
+            r = el
+        if not r.tag or r.tag.lower() != tag:
+            while (r.getprevious() != None) and not (r.getprevious().tag and r.getprevious().tag.lower() == tag):
+                r = r.getprevious()
+            if r.getprevious() != None:
+                r = r.getprevious()
+        if r.tag and r.tag.lower() == tag:
+            return r
+        else:
+            return None
+    def ResolveSecondaryId(self,thread,Ids):
+        id = int(Ids[1])
+        if id in idList:
+            return idList[id][0]
+
+        tagsf = and_(Post.tags.any(tag=thread.chanTag),Post.tags.any(tag=thread.board))
+        f2 = and_(Post.parentid==-1,tagsf)
+        f1 = and_(Post.secondaryIndex==Ids[0],f2)
+        thread = meta.Session.query(Post).filter(f1).first()
+        if thread:
+            if Ids[0] == Ids[1]:
+                return thread.id
+            else:
+                post = meta.Session.query(Post).filter(and_(Post.secondaryIndex==int(Ids[1]),Post.parentid==thread.id)).first()
+                if post:
+                    return post.id
+                else:
+                    return None
+        else:
+            return None
+    def GetPostID(self,post):
+        if post.thread:
+            ids = self.replyIdRe.findall(post.href)
+            return [post.thread.tid,int(ids[0])]
+        else:
+            ids = self.postIdRe.findall(post.href)
+            return [int(ids[0][0]),ids[0][2] and int(ids[0][2]) or int(ids[0][0])]
+
+class Loader:
+    def parseLink(self,link):
+        s1 = link.split('://')
+        p = len(s1)>1 and s1[0] or None
+        p2= p and (p+'://') or ''
+        s2 = s1[-1].split('/')
+        return [p, s2[0], p2 + s2[0] + '/', p2 + '/'.join(s2[:-1]) + '/', s2[-1],'/'+'/'.join(s2[1:])]
+
+class LoaderLocal(Loader):
+    def __init__(self,link):
+        p = self.parseLink(link)
+        self.relativeUrl = p[3]
+    def stat(self,link):
+        try:
+            stats = os.stat(link)
+            return [datetime.datetime.fromtimestamp(stats[8]),stats[6]]
+        except OSError:
+            return None
+    def get(self,url):
+        return open(url,'rb').read()
+    def getAbsolutePath(self,url):
+        return self.relativeUrl + url
+    def getFromRelative(self,url):
+        return self.get(self.getAbsolutePath(url))
+
+class LoaderHTTP(Loader):
+    def __init__(self,link):
+        p = self.parseLink(link)
+        self.proto = p[0]
+        self.host = p[1]
+        self.baseUrl = p[2]
+        self.relativeUrl = p[3]
+    def stat(self,link):
+        linkp = self.parseLink(link)
+        c = httplib.HTTPConnection(linkp[1])
+        c.request('HEAD', linkp[5])
+        r = c.getresponse()
+        if r.status == 200:
+            size = r.getheader('content-length',0)
+            date = r.getheader('last-modified',r.getheader('date',None))
+            return [DTP.getDateTime(date),size]
+        elif r.status == 404:
+            return None
+        else:
+            return None
+    def get(self,url):
+        req = urllib2.Request(url)
+        req.add_header('Referer', self.baseUrl)
+        try:
+            f = urllib2.urlopen(req)
+            res = f.read()
+            return res
+        except urllib2.HTTPError:
+            return None
+    def getAbsolutePath(self,url):
+        if url[0] == '/':
+            return self.baseUrl + url
+        else:
+            return self.relativeUrl + url
+    def getFromRelative(self,url):
+        return self.get(self.getAbsolutePath(url))
+class IBFilter:
+    def filter(self,post):
+        return None
+class IBFilterSage(IBFilter):
+    def filter(self,post):
+        return post.sage
+class IBFilterLowres(IBFilter):
+    def filter(self,post):
+        return post.pic and post.pic.width < 50
+
+class Thread:
+    def __init__(self,entry,parsers,directlink=None,forcetype=None):
+        self.parser = parsers[entry.type]
+        self.tid = entry.tid
+        self.url = entry.url
+        self.board = entry.board
+        self.chanTag= entry.chanTag
+        self.tags = entry.tags and entry.tags.split(',') or []
+        self.type = entry.type
+        self.forcetype = forcetype
+        self.lastChanged = entry.lastChanged
+        self.filters = []
+        filters = entry.filters and entry.filters.split(',') or []
+        if filters:
+            for f in filters:
+                self.filters.append(GFilters[f])
+
+        self.timeDiff = entry.timeDiff
+        self.directlink = directlink
+        self.loader = Loader()
+        if not self.directlink:
+            self.directlink = self.parser.GetThreadLink(self.url,self.board,self.tid)
+        if self.loader.parseLink(self.directlink)[0]:
+            self.loader = LoaderHTTP(self.directlink)
+        else:
+            self.loader = LoaderLocal(self.directlink)
+    def checkState(self):
+        stat = self.loader.stat(self.directlink)
+        if not stat:
+            return [404]
+        elif stat[0] > self.lastChanged:
+            return [200,stat[0],stat[1]]
+        else:
+            return [304,stat[0],stat[1]]
+    def initialize(self):
+        page = self.loader.get(self.directlink)
+        if page:
+            parser = etree.HTMLParser()
+            if isinstance(page, str):
+                page = page.decode('utf-8')
+            self.document = etree.parse(StringIO.StringIO(page), parser)
+            self.posts = self.parser.GetPostsList(self)
+            self.threadId = self.parser.ResolveSecondaryId(self,[self.tid,self.tid])
+            if self.posts:
+                return True
+            else:
+                return False
+        else:
+            return False
+    def filter(self,post):
+        fl = None
+        if self.filters:
+            for f in self.filters:
+                fl = fl or f.filter(post)
+        return fl
+    def ReplaceReference(self,m):
+        mgg = m.groups()
+        mg = [mgg[1],mgg[2]]
+        tid = self.parser.ResolveSecondaryId(self,[mg[0],mg[0]])
+        if tid:
+            if mg[0] != mg[1]:
+                pid = self.parser.ResolveSecondaryId(self,[mg[0],mg[1]])
+            else:
+                pid = tid
+            if pid:
+                return '<a href="%s#%s" onclick="highlight(%s);">&gt;&gt;%s</a>' % (tid, pid, pid, mg[1])
+        print "ERROR! %s/%s does not exist!" % (mg[0],mg[1])
+        return '<a href="%s#%s" onclick="highlight(%s);">&gt;&gt;%s</a>' % (mg[0], mg[1], mg[1], mg[1])
+
+class WakabaParser(IBParser):
+    replyIdRe = re.compile(r""">>(\d+)""")
+    postIdRe = re.compile(r"""\/(\d+)\.x?h?t?ml?(#i?(\d+))?""")
+    referenceRe = re.compile("""<a [^>]*href="([^"]*/)?(\d+)\.[^"]+"[^>]*>\&gt\;\&gt\;(\d+)""")
+    def GetThreadLink(self,url,board,thread):
+        return 'http://'+url+'/'+board+'/res/'+str(thread)+'.html'
+    def GetPostsList(self,thread):
+        posts = thread.document.xpath("/html/body/form//*[@class='reflink']/a")
+        postsList = []
+        if posts:
+            for postA in posts:
+                post = Post()
+                post.thread = thread
+                post.href = postA.get('href')
+                post.reflink = postA.getparent()
+                post.Ids = self.GetPostID(post)
+                post.secondaryIndex = int(post.Ids[1])
+                postsList.append(post)
+            return postsList
+        else:
+            return None
+    def GetImgSrc(self,post):
+        cont = post.l.getparent()
+        for t in cont:
+            if t.tag.lower() == 'a':
+                href = t.get('href')
+                if href and href.find('/src/') != -1:
+                    if post.thread.forcetype:
+                        return '../src/' + post.thread.loader.parseLink(href)[4]
+                    else:
+                        return href
+        return None
+
+    def ParseText(self,post):
+        if post.bq is not None:
+            post.bq.tail = ''
+            message = etree.tostring(post.bq, pretty_print=False,encoding='utf-8')
+            if message[:12].lower() == '<blockquote>' and message[-13:].lower() == '</blockquote>':
+                message = message[12:-13]
+            else:
+                print "Can't parse this message: '%s'" % message
+                return None
+            message = self.referenceRe.sub(post.thread.ReplaceReference,message)
+            return message
+        else:
+            return u''
+    def parsePost(self,post):
+        post.bq = self.GetNextTag(post.reflink,'blockquote')
+        post.l = self.GetPreviousTag(post.reflink,'label')
+        post.title = unicodify(post.l[1].text)
+        if not post.title:
+            post.title = u''
+        post.cpn = post.l[2]
+        post.sage = False
+        if len(post.cpn)>0 and post.cpn[0].tag.lower() == 'a':
+            post.cpnHref = post.cpn[0].get('href')
+            if post.cpnHref.find('sage') > -1:
+                post.sage = True
+        post.src = self.GetImgSrc(post)
+        date = post.l[2].tail.encode('utf-8')
+        date = date.replace("\r",'').replace("\n",'')
+        post.date = DTP.getDateTime(date)
+        post.message = unicodify(self.ParseText(post))
+
+class UpdateArchive(Command):
+    # Parser configuration
+    summary = "--NO SUMMARY--"
+    usage = "--NO USAGE--"
+    group_name = "fc"
+    parser = Command.standard_parser(verbose=False)
+    parser.add_option("--mode")
+    parser.add_option("--chan")
+    parser.add_option("--board")
+    parser.add_option("--thread")
+    parser.add_option("--chanTag")
+    parser.add_option("--type")
+    parser.add_option("--tags")
+    parser.add_option("--timeDiff")
+    parser.add_option("--directlink")
+    parser.add_option("--list")
+    parser.add_option("--filters")
+    parser.add_option("--forcetype")
+    parsers = {'wakaba':WakabaParser()}
+    def command(self):
+        """Load the Pylons app environment and run the requested archive operation"""
+        self.verbose = 3
+        config_file = 'development.ini'
+        config_name = 'config:%s' % config_file
+        here_dir = os.getcwd()
+        locs = dict(__name__="pylons-admin")
+        conf = appconfig(config_name, relative_to=here_dir)
+        conf.update(dict(app_conf=conf.local_conf,global_conf=conf.global_conf))
+        paste.deploy.config.CONFIG.push_thread_config(conf)
+        sys.path.insert(0, here_dir)
+        wsgiapp = loadapp(config_name, relative_to=here_dir)
+        test_app = paste.fixture.TestApp(wsgiapp)
+        tresponse = test_app.get('/_test_vars')
+        request_id = int(tresponse.body)
+        test_app.pre_request_hook = lambda self:paste.registry.restorer.restoration_end()
+        test_app.post_request_hook = lambda self:paste.registry.restorer.restoration_begin(request_id)
+        paste.registry.restorer.restoration_begin(request_id)
+        egg_info = find_egg_info_dir(here_dir)
+        f = open(os.path.join(egg_info, 'top_level.txt'))
+        packages = [l.strip() for l in f.readlines() if l.strip() and not l.strip().startswith('#')]
+        f.close()
+        found_base = False
+        for pkg_name in packages:
+            # Import all objects from the base module
+            base_module = pkg_name + '.lib.base'
+            found_base = can_import(base_module)
+            if not found_base:
+                # Minimal template
+                base_module = pkg_name + '.controllers'
+                found_base = can_import(base_module)
+
+            if found_base:
+                break
+
+        if not found_base:
+            raise ImportError("Could not import base module. Are you sure this is a Pylons app?")
+
+        base = sys.modules[base_module]
+        base_public = [__name for __name in dir(base) if not \
+                       __name.startswith('_') or __name == '_']
+        for name in base_public:
+            locs[name] = getattr(base, name)
+        locs.update(dict(wsgiapp=wsgiapp, app=test_app))
+
+        mapper = tresponse.config.get('routes.map')
+        if mapper:
+            locs['mapper'] = mapper
+
+
+        self.thread = self.options.thread
+        self.chan = self.options.chan
+        self.chanTag = self.options.chanTag
+        self.board = self.options.board
+
+        logging.getLogger('sqlalchemy').setLevel(logging.ERROR)
+        GFilters['sage'] = IBFilterSage()
+        GFilters['lowres'] = IBFilterLowres()
+        #logging.getLogger( 'sqlalchemy').setLevel( logging.NONE )
+        if not self.options.mode or self.options.mode == 'update':
+            self.UpdateArchive()
+        elif self.options.mode == 'add':
+            self.AddToArchive()
+        elif self.options.mode == 'thread':
+            if self.options.list:
+                f = open(self.options.list,'r')
+                tList = f.readlines()
+            else:
+                tList = [self.options.thread]
+            for t in tList:
+                entry = ArchiveList()
+                entry.tid = int(t)
+                entry.url = self.options.chan
+                entry.chanTag = self.options.chanTag
+                entry.board = self.options.board or 'b'
+                entry.tags = self.options.tags or ''
+                entry.type = self.options.type or 'wakaba'
+                entry.filters = self.options.filters or ''
+                entry.timeDiff = self.options.timeDiff or 0
+                entry.lastChanged = datetime.datetime.fromtimestamp(0)
+                print "Processing %s %s %s %s" % (entry.tid,entry.url,entry.chanTag,entry.board)
+                thread = Thread(entry,self.parsers,self.options.directlink,self.options.forcetype)
+                self.processThread(thread)
+
+    def LoadPage(self,thread,chan='2ch.ru',board='b'):
+        self.host = 'http://'+chan
+        if thread:
+            self.path = '/'+board+'/res/'
+            self.url = self.host+self.path+thread+'.html'
+        else:
+            self.path = '/'+board+'/'
+            self.url = self.host+self.path
+        print self.url
+        req = urllib2.Request(self.url)
+        req.add_header('Referer', self.host+'/'+board+'/')
+        f = urllib2.urlopen(req)
+        res = f.read()
+        return res
+
+    def getTags(self,tagsList):
+        tags = []
+        for tagName in tagsList:
+            tag = meta.Session.query(Tag).filter(Tag.tag==tagName).first()
+            if tag:
+                tags.append(tag)
+            else:
+                tags.append(Tag(tagName))
+        return tags
+
+    def processPost(self,post):
+        post.thread.parser.parsePost(post)
+        post.pic = False
+        if post.src:
+            post.pic = self.LoadImage(post)
+            if post.pic == -1:
+                post.pic = None
+        if post.pic:
+            post.picid = post.pic.id
+        print "Thread %s Post %s (Image:%s %s %sx%s) at %s, sage : %s" % (post.Ids[0],post.Ids[1],post.src,post.pic and post.pic.id or 0,post.pic and post.pic.width or 0,post.pic and post.pic.height or 0,post.date,post.sage)
+        if (post.thread.filter(post)):
+            print "Filtered out"
+            print "----------------------"
+        else:
+            if post.Ids[0] == post.Ids[1]:
+                post.parentid = -1
+                post.replyCount = 0
+                post.bumpDate = post.date
+                post.tags = self.getTags([post.thread.chanTag,post.thread.board]+post.thread.tags)
+                post.thread.post = post
+            else:
+                post.parentid = post.thread.post.id
+                if not post.sage:
+                    post.thread.post.bumpDate = post.date
+                post.thread.post.replyCount += 1
+            post.uidNumber = 1
+            meta.Session.save(post)
+            meta.Session.commit()
+            idList[post.Ids[1]]=[post.id,post.Ids[0]]
+            print "Saved in DB as %s/%s" % (post.id,post.parentid)
+            print "----------------------"
+
+    def processThread(self,thread):
+        if thread.initialize():
+            if thread.threadId:
+                thread.post = meta.Session.query(Post).get(thread.threadId)
+                lastPost = meta.Session.query(Post).filter(Post.parentid==thread.post.id).filter(Post.secondaryIndex>0).order_by(Post.secondaryIndex.desc()).first()
+                if lastPost:
+                    lastId = lastPost.secondaryIndex
+                else:
+                    lastId = int(thread.tid)
+            else:
+                lastId = 0
+            skipped = 0
+            for post in thread.posts:
+                if int(post.Ids[1]) > lastId:
+                    if skipped:
+                        print "Skipped %s out of %s posts" % (skipped,len(thread.posts))
+                        skipped=0
+                    self.processPost(post)
+                else:
+                    skipped += 1
+            if skipped:
+                print "Skipped %s out of %s posts" % (skipped,len(thread.posts))
+
+
+    def LoadImage(self,post):
+        url = post.thread.loader.getAbsolutePath(post.src)
+        fileName = post.thread.loader.parseLink(url)[4]
+        res = post.thread.loader.getFromRelative(post.src)
+        if res:
+            localFilePath = os.path.join(g.OPT.uploadPath, fileName)
+            localFile = open(localFilePath,'wb')
+            localFile.write(res)
+            localFile.close()
+            file = FieldStorageLike(fileName,localFilePath)
+            fileDescriptors = self.processFile(file, 200)
+            pic = False
+            if fileDescriptors:
+                pic = fileDescriptors[0]
+                fileHolder = fileDescriptors[1]
+                if pic and pic != -1 and fileHolder:
+                    fileHolder.disableDeletion()
+            return pic
+        else:
+            return None
+
+    def processFile(self, file, thumbSize=250):
+        if isinstance(file, cgi.FieldStorage) or isinstance(file,FieldStorageLike):
+            # We should check whether we got this file already or not
+            # If we don't have it, we add it
+            name = str(long(time.time() * 10**7))
+            ext = file.filename.rsplit('.',1)[:0:-1]
+
+            if ext:
+                ext = ext[0].lstrip(os.sep)
+            else:
+                # Panic, no extension found
+                ext = ''
+                return ''
+
+            # Make sure it's something we want to have
+
+            extParams = meta.Session.query(Extension).filter(Extension.ext==ext).first()
+
+            if not extParams:
+                return False
+
+            localFilePath = os.path.join(g.OPT.uploadPath, name + '.' + ext)
+            localFile = open(localFilePath,'w+b')
+            shutil.copyfileobj(file.file, localFile)
+            localFile.seek(0)
+            md5 = hashlib.md5(localFile.read()).hexdigest()
+            file.file.close()
+            localFile.close()
+
+            pic = meta.Session.query(Picture).filter(Picture.md5==md5).first()
+
+            if pic:
+                os.unlink(localFilePath)
+                return [pic, False]
+
+            try:
+                if extParams.type == 'image':
+                    thumbFilePath = name + 's.' + ext
+                    size = self.makeThumbnail(localFilePath, os.path.join(g.OPT.uploadPath,thumbFilePath), (thumbSize,thumbSize))
+                else:
+                    if extParams.type == 'image-jpg':
+                        thumbFilePath = name + 's.jpg'
+                        size = self.makeThumbnail(localFilePath, os.path.join(g.OPT.uploadPath,thumbFilePath), (thumbSize,thumbSize))
+                    else:
+                        thumbFilePath = extParams.path
+                        size = [0, 0, extParams.thwidth, extParams.thheight]
+            except:
+                return [-1, AngryFileHolder(localFilePath)]
+
+            pic = Picture()
+            pic.path = name + '.' + ext
+            pic.thumpath = thumbFilePath
+            pic.width = size[0]
+            pic.height = size[1]
+            pic.thwidth = size[2]
+            pic.thheight = size[3]
+            pic.extid = extParams.id
+            pic.size = os.stat(localFilePath)[6]
+            pic.md5 = md5
+            meta.Session.save(pic)
+            meta.Session.commit()
+            return [pic, AngryFileHolder(localFilePath, pic)]
+        else:
+            return False
+
+    def makeThumbnail(self, source, dest, maxSize):
+        sourceImage = Image.open(source)
+        size = sourceImage.size
+        if sourceImage:
+            sourceImage.thumbnail(maxSize,Image.ANTIALIAS)
+            sourceImage.save(dest)
+            return size + sourceImage.size
+        else:
+            return []
+    def AddToArchive(self):
+        if self.options.thread and self.options.chan and self.options.chanTag:
+            if not self.options.board:
+                self.options.board = 'b'
+            entry = meta.Session.query(ArchiveList).filter(ArchiveList.tid==self.options.thread).filter(ArchiveList.url==self.options.chan).filter(ArchiveList.board==self.options.board).first()
+            if entry:
+                print "Thread is already in the list"
+            else:
+                entry = ArchiveList()
+                entry.tid = self.options.thread
+                entry.url = self.options.chan
+                entry.chanTag = self.options.chanTag
+                entry.board = self.options.board
+                entry.tags = self.options.tags or ''
+                entry.type = self.options.type or 'wakaba'
+                entry.filters = self.options.filters or ''
+                entry.timeDiff = self.options.timeDiff or 0
+                entry.lastChanged = datetime.datetime.fromtimestamp(0)
+                meta.Session.save(entry)
+                meta.Session.commit()
+        else:
+            print "Bad parameters"
+    def UpdateArchive(self):
+        archiveList = meta.Session.query(ArchiveList).all()
+        for entry in archiveList:
+            thread = Thread(entry,self.parsers)
+            state = thread.checkState()
+            print "*** Thread %s HTTP %s" % (thread.directlink,state[0])
+            if state[0] == 404:
+                meta.Session.delete(entry)
+                meta.Session.commit()
+            elif state[0] == 200:
+                self.processThread(thread)
+                entry.lastChanged = state[1]
+                meta.Session.commit()
-- 
2.11.4.GIT
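For reference, a possible invocation of the new command. The setup.py entry-point registration is not part of this patch, so the command name below (update-archive) and the example ids are assumptions; the option names come from the parser.add_option calls above, and the command reads development.ini from the current directory:

    paster update-archive --mode=add --chan=2ch.ru --chanTag=2ch --board=b --thread=1234567
    paster update-archive --mode=update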