2 from paste
.script
.command
import Command
3 from fc
.lib
.base
import *
5 from sqlalchemy
.orm
import eagerload
6 from sqlalchemy
.orm
import class_mapper
7 from sqlalchemy
.sql
import and_
, or_
, not_
17 from fc
.lib
.fuser
import FUser
18 from fc
.lib
.miscUtils
import *
19 from fc
.lib
.constantValues
import *
20 from fc
.lib
.settings
import *
21 from fc
.lib
.fileHolder
import AngryFileHolder
25 import paste
.deploy
.config
26 from paste
.deploy
import loadapp
, appconfig
27 from paste
.script
.command
import Command
, BadCommand
28 from paste
.script
.filemaker
import FileOp
29 from paste
.script
.pluginlib
import find_egg_info_dir
32 from lxml
import etree
34 from fc
.model
.arch
import *
38 """Attempt to __import__ the specified package/module, returning True when
39 succeeding, otherwise False"""
47 if isinstance(text
, str):
48 text
= text
.decode('utf-8')
55 monthes
= [('Янв','Jan','января'),('Фев','Feb','февраля'),('Мар','Mar','марта'),('Апр','Apr','апреля'),('Май','May','мая'),('Июн','Jun','июня'),('Июл','Jul','июля'),('Авг','Aug','августа'),('Сен','Sep','сентября'),('Окт','Oct','октября'),('Ноя','Nov','ноября'),('Дек','Dec','декабря')]
56 dateRe
= re
.compile(r
"""[^\d]+(\d+)\s+([^\d\s]+)\s+(\d+)\s+(\d+)\:(\d+)\:(\d+)""")
57 dateReISO
= re
.compile(r
"""(\d+)\-(\d+)\-(\d+) (\d+)\:(\d+)\:(\d+)""")
58 def getDateTime(self
,date
):
59 dateP
= self
.dateRe
.findall(date
)
63 for mm
in self
.monthes
:
69 return datetime
.datetime(int(dateP
[2]),mi
,int(dateP
[0]),int(dateP
[3]),int(dateP
[4]),int(dateP
[5]))
72 def getDateTimeFromISO8601(self
,date
):
73 dateP
= self
.dateReISO
.findall(date
)
75 return datetime
.datetime(int(dateP
[0]),int(dateP
[1]),int(dateP
[2]),int(dateP
[3]),int(dateP
[4]),int(dateP
[5]))
77 DTP
= DateTimeParser()
80 def GetNextTag(self
,el
,tag
,skip
=0):
86 if not r
.tag
or r
.tag
.lower() != tag
:
87 while (r
.getnext() != None) and not (r
.getnext().tag
and r
.getnext().tag
.lower() == tag
):
89 if r
.getnext() != None:
91 if r
.tag
and r
.tag
.lower() == tag
:
96 def GetPreviousTag(self
,el
,tag
,skip
=0):
102 if not r
.tag
or r
.tag
.lower() != tag
:
103 while (r
.getprevious() != None) and not (r
.getprevious().tag
and r
.getprevious().tag
.lower() == tag
):
105 if r
.getprevious() != None:
107 if r
.tag
and r
.tag
.lower() == tag
:
111 def ResolveSecondaryId(self
,thread
,Ids
):
116 tagsf
= and_(Post
.tags
.any(tag
=thread
.chanTag
),Post
.tags
.any(tag
=thread
.board
))
117 f2
= and_(Post
.parentid
==-1,tagsf
)
118 f1
= and_(Post
.secondaryIndex
==Ids
[0],f2
)
119 thread
= meta
.Session
.query(Post
).filter(f1
).first()
124 post
= meta
.Session
.query(Post
).filter(and_(Post
.secondaryIndex
==int(Ids
[1]),Post
.parentid
==thread
.id)).first()
131 def GetPostID(self
,post
):
133 ids
= self
.replyIdRe
.findall(post
.href
)
134 return [post
.thread
.tid
,int(ids
[0])]
136 ids
= self
.postIdRe
.findall(post
.href
)
137 return [int(ids
[0][0]),ids
[0][2] and int(ids
[0][2]) or int(ids
[0][0])]
def parseLink(self, link):
    """Split a URL or a local path into the pieces the loaders use.

    Returns a six-element list:
      [0] scheme (the text before the first '://'), or None when the link
          has no scheme or the scheme is empty
      [1] first '/'-separated segment after the scheme (the host for a URL)
      [2] scheme prefix + first segment + '/'
      [3] scheme prefix + everything up to the last '/' + '/'
      [4] last '/'-separated segment (the file name)
      [5] '/' + everything after the first segment

    Only the FIRST '://' is treated as the scheme separator, so a link that
    embeds another URL (for example inside a query string) keeps its own
    host instead of reporting the embedded one.
    """
    # maxsplit=1: anything after the first '://' belongs to the path.
    parts = link.split('://', 1)
    if len(parts) > 1 and parts[0]:
        scheme = parts[0]
        prefix = scheme + '://'
    else:
        # No scheme (or an empty one): report None and add no prefix.
        scheme = None
        prefix = ''
    segments = parts[-1].split('/')
    return [
        scheme,
        segments[0],
        prefix + segments[0] + '/',
        prefix + '/'.join(segments[:-1]) + '/',
        segments[-1],
        '/' + '/'.join(segments[1:]),
    ]
147 class LoaderLocal(Loader
):
148 def __init__(self
,link
):
149 p
= self
.parseLink(link
)
150 self
.relativeUrl
= p
[3]
153 stats
= os
.stat(link
)
154 return [datetime
.datetime
.fromtimestamp(stats
[8]),stats
[6]]
158 return open(url
,'rb').read()
def getAbsolutePath(self, url):
    """Return *url* with this loader's ``relativeUrl`` prepended."""
    prefix = self.relativeUrl
    return prefix + url
def getFromRelative(self, url):
    """Resolve *url* with ``getAbsolutePath`` and return what ``get`` yields for it."""
    fetch = self.get
    return fetch(self.getAbsolutePath(url))
164 class LoaderHTTP(Loader
):
165 def __init__(self
,link
):
166 p
= self
.parseLink(link
)
170 self
.relativeUrl
= p
[3]
172 linkp
= self
.parseLink(link
)
173 c
= httplib
.HTTPConnection(linkp
[1])
174 c
.request('HEAD', linkp
[5])
177 size
= r
.getheader('content-length',0)
178 date
= r
.getheader('last-modified',r
.getheader('date',None))
179 return [DTP
.getDateTime(date
),size
]
180 elif r
.status
== 404:
185 req
= urllib2
.Request(url
)
186 req
.add_header('Referer', self
.baseUrl
)
188 f
= urllib2
.urlopen(req
)
191 except urllib2
.HTTPError
:
193 def getAbsolutePath(self
,url
):
195 return self
.baseUrl
+ url
197 return self
.relativeUrl
+ url
def getFromRelative(self, url):
    """Fetch the resource at *url* after converting it via ``getAbsolutePath``.

    The result is whatever ``self.get`` returns for the converted address.
    """
    loader_get = self.get
    target = self.getAbsolutePath(url)
    return loader_get(target)
201 def filter(self
,post
):
203 class IBFilterSage(IBFilter
):
204 def filter(self
,post
):
class IBFilterLowres(IBFilter):
    """Filter that matches posts whose picture has a width below 50."""

    def filter(self, post):
        """Tell whether *post* carries a picture with ``width`` under 50.

        When ``post.pic`` is falsy, that falsy value itself is returned;
        otherwise the result is the boolean ``post.pic.width < 50``.
        """
        picture = post.pic
        if not picture:
            return picture
        return picture.width < 50
211 def __init__(self
,entry
,parsers
,directlink
=None,forcetype
=None):
212 self
.parser
= parsers
[entry
.type]
215 self
.board
= entry
.board
216 self
.chanTag
= entry
.chanTag
217 self
.tags
= entry
.tags
and entry
.tags
.split(',') or []
218 self
.type = entry
.type
219 self
.forcetype
= forcetype
220 self
.lastChanged
= entry
.lastChanged
222 filters
= entry
.filters
and entry
.filters
.split(',') or []
225 self
.filters
.append(GFilters
[f
])
227 self
.timeDiff
= entry
.timeDiff
228 self
.directlink
= directlink
229 self
.loader
= Loader()
230 if not self
.directlink
:
231 self
.directlink
= self
.parser
.GetThreadLink(self
.url
,self
.board
,self
.tid
)
232 if self
.loader
.parseLink(self
.directlink
)[0]:
233 self
.loader
= LoaderHTTP(self
.directlink
)
235 self
.loader
= LoaderLocal(self
.directlink
)
236 def checkState(self
):
237 stat
= self
.loader
.stat(self
.directlink
)
240 elif stat
[0] > self
.lastChanged
:
241 return [200,stat
[0],stat
[1]]
243 return [304,stat
[0],stat
[1]]
244 def initialize(self
):
245 page
= self
.loader
.get(self
.directlink
)
247 parser
= etree
.HTMLParser()
248 if isinstance(page
, str):
249 page
= page
.decode('utf-8')
250 self
.document
= etree
.parse(StringIO
.StringIO(page
), parser
)
251 self
.posts
= self
.parser
.GetPostsList(self
)
252 self
.threadId
= self
.parser
.ResolveSecondaryId(self
,[self
.tid
,self
.tid
])
259 def filter(self
,post
):
262 for f
in self
.filters
:
263 fl
= fl
or f
.filter(post
)
265 def ReplaceReference(self
,m
):
268 tid
= self
.parser
.ResolveSecondaryId(self
,[mg
[0],mg
[0]])
271 pid
= self
.parser
.ResolveSecondaryId(self
,[mg
[0],mg
[1]])
275 return '<a href="/%s#i%s" onclick="highlight(%s)">>>%s</a>' % (tid
, pid
, pid
, mg
[1])
276 print "ERROR! %s/%s does not exist!" % (mg
[0],mg
[1])
277 return '<a href="/secondaryIndex/%s#i%s" onclick="highlight(%s)">>>%s</a>' % (mg
[0], mg
[1], mg
[1], mg
[1])
279 class WakabaParser(IBParser
):
280 replyIdRe
= re
.compile(r
""">>(\d+)""")
281 postIdRe
= re
.compile(r
"""\/(\d+)\.x?h?t?ml?(#i?(\d+))?""")
282 referenceRe
= re
.compile("""<a [^>]*href="([^"]*/)?(\d+)\.[^"]+"[^>]*>\>\;\>\;(\d+)</a>""")
def GetThreadLink(self, url, board, thread):
    """Build the thread page address ``http://<url>/<board>/res/<thread>.html``."""
    pieces = ('http://', url, '/', board, '/res/', str(thread), '.html')
    return ''.join(pieces)
285 def GetPostsList(self
,thread
):
286 posts
= thread
.document
.xpath("/html/body/form//*[@class='reflink']/a")
292 post
.href
= postA
.get('href')
293 post
.reflink
= postA
.getparent()
294 post
.Ids
= self
.GetPostID(post
)
295 post
.secondaryIndex
= int(post
.Ids
[1])
296 postsList
.append(post
)
300 def GetImgSrc(self
,post
):
301 cont
= post
.l
.getparent()
303 if t
.tag
.lower() == 'a':
305 if href
and href
.find('/src/') != -1:
306 if post
.thread
.forcetype
:
307 return '../src/' + post
.thread
.loader
.parseLink(href
)[4]
312 def ParseText(self
,post
):
313 if post
.bq
is not None:
315 message
= etree
.tostring(post
.bq
, pretty_print
=False,encoding
='utf-8')
316 if message
[:12].lower() == '<blockquote>' and message
[-13:].lower() == '</blockquote>':
317 message
= message
[12:-13]
319 print "Cant parse this message : '%s'" % message
321 message
= self
.referenceRe
.sub(post
.thread
.ReplaceReference
,message
)
325 def parsePost(self
,post
):
326 post
.bq
= self
.GetNextTag(post
.reflink
,'blockquote')
327 post
.l
= self
.GetPreviousTag(post
.reflink
,'label')
328 post
.title
= unicodify(post
.l
[1].text
)
333 if len(post
.cpn
)>0 and post
.cpn
[0].tag
.lower() == 'a':
334 post
.cpnHref
= post
.cpn
[0].get('href')
335 if post
.cpnHref
.find('sage') > -1:
337 post
.src
= self
.GetImgSrc(post
)
338 date
= post
.l
[2].tail
.encode('utf-8')
339 date
= date
.replace("\r",'').replace("\n",'')
340 post
.date
= DTP
.getDateTime(date
)
341 post
.message
= unicodify(self
.ParseText(post
))
343 class UpdateArchive(Command
):
344 # Parser configuration
345 summary
= "--NO SUMMARY--"
346 usage
= "--NO USAGE--"
348 parser
= Command
.standard_parser(verbose
=False)
349 parser
.add_option("--mode")
350 parser
.add_option("--chan")
351 parser
.add_option("--board")
352 parser
.add_option("--thread")
353 parser
.add_option("--chanTag")
354 parser
.add_option("--type")
355 parser
.add_option("--tags")
356 parser
.add_option("--timeDiff")
357 parser
.add_option("--directlink")
358 parser
.add_option("--list")
359 parser
.add_option("--filters")
360 parser
.add_option("--forcetype")
361 parsers
= {'wakaba':WakabaParser()}
363 """Main command to create a new shell"""
365 config_file
= 'development.ini'
366 config_name
= 'config:%s' % config_file
367 here_dir
= os
.getcwd()
368 locs
= dict(__name__
="pylons-admin")
369 conf
= appconfig(config_name
, relative_to
=here_dir
)
370 conf
.update(dict(app_conf
=conf
.local_conf
,global_conf
=conf
.global_conf
))
371 paste
.deploy
.config
.CONFIG
.push_thread_config(conf
)
372 sys
.path
.insert(0, here_dir
)
373 wsgiapp
= loadapp(config_name
, relative_to
=here_dir
)
374 test_app
= paste
.fixture
.TestApp(wsgiapp
)
375 tresponse
= test_app
.get('/_test_vars')
376 request_id
= int(tresponse
.body
)
377 test_app
.pre_request_hook
= lambda self
:paste
.registry
.restorer
.restoration_end()
378 test_app
.post_request_hook
= lambda self
:paste
.registry
.restorer
.restoration_begin(request_id
)
379 paste
.registry
.restorer
.restoration_begin(request_id
)
380 egg_info
= find_egg_info_dir(here_dir
)
381 f
= open(os
.path
.join(egg_info
, 'top_level.txt'))
382 packages
= [l
.strip() for l
in f
.readlines() if l
.strip() and not l
.strip().startswith('#')]
385 for pkg_name
in packages
:
386 # Import all objects from the base module
387 base_module
= pkg_name
+ '.lib.base'
388 found_base
= can_import(base_module
)
391 base_module
= pkg_name
+ '.controllers'
392 found_base
= can_import(base_module
)
398 raise ImportError("Could not import base module. Are you sure this is a Pylons app?")
400 base
= sys
.modules
[base_module
]
401 base_public
= [__name
for __name
in dir(base
) if not \
402 __name
.startswith('_') or __name
== '_']
403 for name
in base_public
:
404 locs
[name
] = getattr(base
, name
)
405 locs
.update(dict(wsgiapp
=wsgiapp
, app
=test_app
))
407 mapper
= tresponse
.config
.get('routes.map')
409 locs
['mapper'] = mapper
412 self
.thread
= self
.options
.thread
413 self
.chan
= self
.options
.chan
414 self
.chanTag
= self
.options
.chanTag
415 self
.board
= self
.options
.board
417 logging
.getLogger('sqlalchemy').setLevel(logging
.ERROR
)
418 GFilters
['sage'] = IBFilterSage()
419 GFilters
['lowres'] = IBFilterLowres()
420 #logging.getLogger( 'sqlalchemy').setLevel( logging.NONE )
421 if not self
.options
.mode
or self
.options
.mode
== 'update':
423 elif self
.options
.mode
== 'add':
425 elif self
.options
.mode
== 'thread':
426 if self
.options
.list:
427 f
= open(self
.options
.list,'r')
428 tList
= f
.readlines()
430 tList
= [self
.options
.thread
]
432 entry
= ArchiveList()
434 entry
.url
= self
.options
.chan
435 entry
.chanTag
= self
.options
.chanTag
436 entry
.board
= self
.options
.board
or 'b'
437 entry
.tags
= self
.options
.tags
or ''
438 entry
.type = self
.options
.type or 'wakaba'
439 entry
.filters
= self
.options
.filters
or ''
440 entry
.timeDiff
= self
.options
.timeDiff
or 0
441 entry
.lastChanged
= datetime
.datetime
.fromtimestamp(0)
442 print "Processing %s %s %s %s" % (entry
.tid
,entry
.url
,entry
.chanTag
,entry
.board
)
443 thread
= Thread(entry
,self
.parsers
,self
.options
.directlink
,self
.options
.forcetype
)
444 self
.processThread(thread
)
446 def LoadPage(self
,thread
,chan
='2ch.ru',board
='b'):
447 self
.host
= 'http://'+chan
449 self
.path
= '/'+board
+'/res/'
450 self
.url
= self
.host
+self
.path
+thread
+'.html'
452 self
.path
= '/'+board
+'/'
453 self
.url
= self
.host
+self
.path
455 req
= urllib2
.Request(self
.url
)
456 req
.add_header('Referer', self
.host
+'/'+board
+'/')
457 f
= urllib2
.urlopen(req
)
461 def getTags(self
,tagsList
):
463 for tagName
in tagsList
:
464 tag
= meta
.Session
.query(Tag
).filter(Tag
.tag
==tagName
).first()
468 tags
.append(Tag(tagName
))
471 def processPost(self
,post
):
472 post
.thread
.parser
.parsePost(post
)
475 post
.pic
= self
.LoadImage(post
)
479 post
.picid
= post
.pic
.id
480 print "Thread %s Post %s (Image:%s %s %sx%s) at %s, sage : %s" % (post
.Ids
[0],post
.Ids
[1],post
.src
,post
.pic
and post
.pic
.id or 0,post
.pic
and post
.pic
.width
or 0,post
.pic
and post
.pic
.height
or 0,post
.date
,post
.sage
)
481 if (post
.thread
.filter(post
)):
483 print "----------------------"
485 if post
.Ids
[0] == post
.Ids
[1]:
488 post
.bumpDate
= post
.date
489 post
.tags
= self
.getTags([post
.thread
.chanTag
,post
.thread
.board
]+post
.thread
.tags
)
490 post
.thread
.post
= post
492 post
.parentid
= post
.thread
.post
.id
494 post
.thread
.post
.bumpDate
= post
.date
495 post
.thread
.post
.replyCount
+= 1
497 meta
.Session
.save(post
)
498 meta
.Session
.commit()
499 idList
[post
.Ids
[1]]=[post
.id,post
.Ids
[0]]
500 print "Saved in DB as %s/%s" % (post
.id,post
.parentid
)
501 print "----------------------"
503 def processThread(self
,thread
):
504 if thread
.initialize():
506 thread
.post
= meta
.Session
.query(Post
).get(thread
.threadId
)
507 lastPost
= meta
.Session
.query(Post
).filter(Post
.parentid
==thread
.post
.id).filter(Post
.secondaryIndex
>0).order_by(Post
.secondaryIndex
.desc()).first()
509 lastId
= lastPost
.secondaryIndex
511 lastId
= int(thread
.tid
)
515 for post
in thread
.posts
:
516 if int(post
.Ids
[1]) > lastId
:
518 print "Skipped %s out of %s posts" % (skipped
,len(thread
.posts
))
520 self
.processPost(post
)
524 print "Skipped %s out of %s posts" % (skipped
,len(thread
.posts
))
527 def LoadImage(self
,post
):
528 url
= post
.thread
.loader
.getAbsolutePath(post
.src
)
529 fileName
= post
.thread
.loader
.parseLink(url
)[4]
530 res
= post
.thread
.loader
.getFromRelative(post
.src
)
532 localFilePath
= os
.path
.join(g
.OPT
.uploadPath
, fileName
)
533 localFile
= open(localFilePath
,'wb')
536 file = FieldStorageLike(fileName
,localFilePath
)
537 fileDescriptors
= self
.processFile(file, 200)
540 pic
= fileDescriptors
[0]
541 fileHolder
= fileDescriptors
[1]
542 if pic
and pic
!= -1 and fileHolder
:
543 fileHolder
.disableDeletion()
548 def processFile(self
, file, thumbSize
=250):
549 if isinstance(file, cgi
.FieldStorage
) or isinstance(file,FieldStorageLike
):
550 # We should check whether we already have this file or not
551 # If we don't have it, we add it
552 name
= str(long(time
.time() * 10**7))
553 ext
= file.filename
.rsplit('.',1)[:0:-1]
556 ext
= ext
[0].lstrip(os
.sep
)
558 # Panic, no extension found
562 # Make sure it's something we want to have
564 extParams
= meta
.Session
.query(Extension
).filter(Extension
.ext
==ext
).first()
569 localFilePath
= os
.path
.join(g
.OPT
.uploadPath
, name
+ '.' + ext
)
570 localFile
= open(localFilePath
,'w+b')
571 shutil
.copyfileobj(file.file, localFile
)
573 md5
= hashlib
.md5(localFile
.read()).hexdigest()
577 pic
= meta
.Session
.query(Picture
).filter(Picture
.md5
==md5
).first()
580 os
.unlink(localFilePath
)
584 if extParams
.type == 'image':
585 thumbFilePath
= name
+ 's.' + ext
586 size
= self
.makeThumbnail(localFilePath
, os
.path
.join(g
.OPT
.uploadPath
,thumbFilePath
), (thumbSize
,thumbSize
))
588 if extParams
.type == 'image-jpg':
589 thumbFilePath
= name
+ 's.jpg'
590 size
= self
.makeThumbnail(localFilePath
, os
.path
.join(g
.OPT
.uploadPath
,thumbFilePath
), (thumbSize
,thumbSize
))
592 thumbFilePath
= extParams
.path
593 size
= [0, 0, extParams
.thwidth
, extParams
.thheight
]
595 return [-1, AngryFileHolder(localFilePath
)]
598 pic
.path
= name
+ '.' + ext
599 pic
.thumpath
= thumbFilePath
602 pic
.thwidth
= size
[2]
603 pic
.thheight
= size
[3]
604 pic
.extid
= extParams
.id
605 pic
.size
= os
.stat(localFilePath
)[6]
607 meta
.Session
.save(pic
)
608 meta
.Session
.commit()
609 return [pic
, AngryFileHolder(localFilePath
, pic
)]
613 def makeThumbnail(self
, source
, dest
, maxSize
):
614 sourceImage
= Image
.open(source
)
615 size
= sourceImage
.size
617 sourceImage
.thumbnail(maxSize
,Image
.ANTIALIAS
)
618 sourceImage
.save(dest
)
619 return size
+ sourceImage
.size
622 def AddToArchive(self
):
623 if self
.options
.thread
and self
.options
.chan
and self
.options
.chanTag
:
624 if not self
.options
.board
:
625 self
.options
.board
= 'b'
626 entry
= meta
.Session
.query(ArchiveList
).filter(ArchiveList
.tid
==self
.options
.thread
).filter(ArchiveList
.url
==self
.options
.chan
).filter(ArchiveList
.board
==self
.options
.board
).first()
628 print "Thread is already in the list"
630 entry
= ArchiveList()
631 entry
.tid
= self
.options
.thread
632 entry
.url
= self
.options
.chan
633 entry
.chanTag
= self
.options
.chanTag
634 entry
.board
= self
.options
.board
635 entry
.tags
= self
.options
.tags
or ''
636 entry
.type = self
.options
.type or 'wakaba'
637 entry
.filters
= self
.options
.filters
or ''
638 entry
.timeDiff
= self
.options
.timeDiff
or 0
639 entry
.lastChanged
= datetime
.datetime
.fromtimestamp(0)
640 meta
.Session
.save(entry
)
641 meta
.Session
.commit()
643 print "Bad parameters"
644 def UpdateArchive(self
):
645 archiveList
= meta
.Session
.query(ArchiveList
).all()
646 for entry
in archiveList
:
647 thread
= Thread(entry
,self
.parsers
)
648 state
= thread
.checkState()
649 print "*** Thread %s HTTP %s" % (thread
.directlink
,state
[0])
651 meta
.Session
.delete(entry
)
652 meta
.Session
.commit()
653 elif state
[0] == 200:
654 self
.processThread(thread
)
655 entry
.lastChanged
= state
[1]
656 meta
.Session
.commit()