3 from paste
.script
.command
import Command
4 from fc
.lib
.base
import *
6 from sqlalchemy
.orm
import eagerload
7 from sqlalchemy
.orm
import class_mapper
8 from sqlalchemy
.sql
import and_
, or_
, not_
19 from fc
.lib
.fuser
import FUser
20 from fc
.lib
.miscUtils
import *
21 from fc
.lib
.constantValues
import *
22 from fc
.lib
.settings
import *
23 from fc
.lib
.fileHolder
import AngryFileHolder
27 import paste
.deploy
.config
28 from paste
.deploy
import loadapp
, appconfig
29 from paste
.script
.command
import Command
, BadCommand
30 from paste
.script
.filemaker
import FileOp
31 from paste
.script
.pluginlib
import find_egg_info_dir
34 from lxml
import etree
36 from fc
.model
.arch
import *
40 """Attempt to __import__ the specified package/module, returning True when
41 succeeding, otherwise False"""
49 if isinstance(text
, str):
50 text
= text
.decode('utf-8')
57 monthes
= [('Янв','Jan','января'),('Фев','Feb','февраля'),('Мар','Mar','марта'),('Апр','Apr','апреля'),('Май','May','мая'),('Июн','Jun','июня'),('Июл','Jul','июля'),('Авг','Aug','августа'),('Сен','Sep','сентября'),('Окт','Oct','октября'),('Ноя','Nov','ноября'),('Дек','Dec','декабря')]
58 dateRe
= re
.compile(r
"""[^\d]+(\d+)\s+([^\d\s]+)\s+(\d+)\s+(\d+)\:(\d+)\:(\d+)""")
59 dateReISO
= re
.compile(r
"""(\d+)\-(\d+)\-(\d+) (\d+)\:(\d+)\:(\d+)""")
60 def getDateTime(self
,date
):
61 dateP
= self
.dateRe
.findall(date
)
65 for mm
in self
.monthes
:
71 return datetime
.datetime(int(dateP
[2]),mi
,int(dateP
[0]),int(dateP
[3]),int(dateP
[4]),int(dateP
[5]))
74 def getDateTimeFromISO8601(self
,date
):
75 dateP
= self
.dateReISO
.findall(date
)
77 return datetime
.datetime(int(dateP
[0]),int(dateP
[1]),int(dateP
[2]),int(dateP
[3]),int(dateP
[4]),int(dateP
[5]))
79 DTP
= DateTimeParser()
82 def GetNextTag(self
,el
,tag
,skip
=0):
88 if not r
.tag
or r
.tag
.lower() != tag
:
89 while (r
.getnext() != None) and not (r
.getnext().tag
and r
.getnext().tag
.lower() == tag
):
91 if r
.getnext() != None:
93 if r
.tag
and r
.tag
.lower() == tag
:
98 def GetPreviousTag(self
,el
,tag
,skip
=0):
104 if not r
.tag
or r
.tag
.lower() != tag
:
105 while (r
.getprevious() != None) and not (r
.getprevious().tag
and r
.getprevious().tag
.lower() == tag
):
107 if r
.getprevious() != None:
109 if r
.tag
and r
.tag
.lower() == tag
:
113 def ResolveSecondaryId(self
,thread
,Ids
):
118 tagsf
= and_(Post
.tags
.any(tag
=thread
.chanTag
),Post
.tags
.any(tag
=thread
.board
))
119 f2
= and_(Post
.parentid
==-1,tagsf
)
120 f1
= and_(Post
.secondaryIndex
==Ids
[0],f2
)
121 thread
= meta
.Session
.query(Post
).filter(f1
).first()
126 post
= meta
.Session
.query(Post
).filter(and_(Post
.secondaryIndex
==int(Ids
[1]),Post
.parentid
==thread
.id)).first()
133 def GetPostID(self
,post
):
135 ids
= self
.replyIdRe
.findall(post
.href
)
136 return [post
.thread
.tid
,int(ids
[0])]
138 ids
= self
.postIdRe
.findall(post
.href
)
139 return [int(ids
[0][0]),ids
[0][2] and int(ids
[0][2]) or int(ids
[0][0])]
def parseLink(self, link):
    """Split a URL or a local path into the pieces the loaders work with.

    Returns a six-element list:
      [0] the scheme (e.g. 'http'), or None when ``link`` has no non-empty
          ``scheme://`` prefix;
      [1] the first '/'-separated component after the scheme (the host for
          a URL);
      [2] scheme prefix + that first component + '/';
      [3] scheme prefix + everything up to the last '/' + '/';
      [4] the last '/'-separated component (the file name);
      [5] '/' + everything after the first component.
    """
    # Split on the FIRST '://' only: a later '://' (for instance inside a
    # query string) belongs to the path and must not be mistaken for the
    # scheme separator.
    pieces = link.split('://', 1)
    if len(pieces) > 1 and pieces[0]:
        scheme = pieces[0]
        prefix = scheme + '://'
    else:
        # No scheme, or an empty one: treat the link as scheme-less.
        scheme = None
        prefix = ''
    segments = pieces[-1].split('/')
    host = segments[0]
    return [
        scheme,
        host,
        prefix + host + '/',
        prefix + '/'.join(segments[:-1]) + '/',
        segments[-1],
        '/' + '/'.join(segments[1:]),
    ]
149 class LoaderLocal(Loader
):
150 def __init__(self
,link
):
151 p
= self
.parseLink(link
)
152 self
.relativeUrl
= p
[3]
155 stats
= os
.stat(link
)
156 return [datetime
.datetime
.fromtimestamp(stats
[8]),stats
[6]]
160 return open(url
,'rb').read()
def getAbsolutePath(self, url):
    """Return ``url`` with this loader's ``relativeUrl`` prepended."""
    prefix = self.relativeUrl
    return prefix + url
def getFromRelative(self, url):
    """Fetch a resource addressed relative to this loader.

    ``url`` is expanded with ``getAbsolutePath`` and the result is passed
    to ``get``, whose return value is returned unchanged.
    """
    target = self.getAbsolutePath(url)
    return self.get(target)
166 class LoaderHTTP(Loader
):
167 def __init__(self
,link
):
168 p
= self
.parseLink(link
)
172 self
.relativeUrl
= p
[3]
174 linkp
= self
.parseLink(link
)
175 c
= httplib
.HTTPConnection(linkp
[1])
176 c
.request('HEAD', linkp
[5])
179 size
= r
.getheader('content-length',0)
180 date
= r
.getheader('last-modified',r
.getheader('date',None))
181 return [DTP
.getDateTime(date
),size
]
182 elif r
.status
== 404:
187 req
= urllib2
.Request(url
)
188 req
.add_header('Referer', self
.baseUrl
)
190 f
= urllib2
.urlopen(req
)
193 except urllib2
.HTTPError
:
195 def getAbsolutePath(self
,url
):
197 return self
.baseUrl
+ url
199 return self
.relativeUrl
+ url
def getFromRelative(self, url):
    """Resolve ``url`` through ``getAbsolutePath`` and fetch it with ``get``.

    Returns whatever ``get`` returns for the resolved address.
    """
    resolved = self.getAbsolutePath(url)
    return self.get(resolved)
203 def filter(self
,post
):
205 class IBFilterSage(IBFilter
):
206 def filter(self
,post
):
class IBFilterLowres(IBFilter):
    """Filter that matches posts whose attached picture is under 50 wide."""

    def filter(self, post):
        """Return a truthy value when ``post`` has a picture narrower than 50.

        When ``post.pic`` is falsy that falsy value itself is returned;
        otherwise the result of the width comparison is returned.
        """
        picture = post.pic
        if not picture:
            return picture
        return picture.width < 50
213 def __init__(self
,entry
,parsers
,directlink
=None,forcetype
=None):
214 self
.parser
= parsers
[entry
.type]
217 self
.board
= entry
.board
218 self
.chanTag
= entry
.chanTag
219 self
.tags
= entry
.tags
and entry
.tags
.split(',') or []
220 self
.type = entry
.type
221 self
.forcetype
= forcetype
222 self
.lastChanged
= entry
.lastChanged
224 filters
= entry
.filters
and entry
.filters
.split(',') or []
227 self
.filters
.append(GFilters
[f
])
229 self
.timeDiff
= entry
.timeDiff
230 self
.directlink
= directlink
231 self
.loader
= Loader()
232 if not self
.directlink
:
233 self
.directlink
= self
.parser
.GetThreadLink(self
.url
,self
.board
,self
.tid
)
234 if self
.loader
.parseLink(self
.directlink
)[0]:
235 self
.loader
= LoaderHTTP(self
.directlink
)
237 self
.loader
= LoaderLocal(self
.directlink
)
238 def checkState(self
):
239 stat
= self
.loader
.stat(self
.directlink
)
242 elif stat
[0] > self
.lastChanged
:
243 return [200,stat
[0],stat
[1]]
245 return [304,stat
[0],stat
[1]]
246 def initialize(self
):
247 page
= self
.loader
.get(self
.directlink
)
249 parser
= etree
.HTMLParser()
250 if isinstance(page
, str):
251 page
= page
.decode('utf-8')
252 self
.document
= etree
.parse(StringIO
.StringIO(page
), parser
)
253 self
.posts
= self
.parser
.GetPostsList(self
)
254 self
.threadId
= self
.parser
.ResolveSecondaryId(self
,[self
.tid
,self
.tid
])
261 def filter(self
,post
):
264 for f
in self
.filters
:
265 fl
= fl
or f
.filter(post
)
267 def ReplaceReference(self
,m
):
270 tid
= self
.parser
.ResolveSecondaryId(self
,[mg
[0],mg
[0]])
273 pid
= self
.parser
.ResolveSecondaryId(self
,[mg
[0],mg
[1]])
277 return '<a href="/%s#i%s" onclick="highlight(%s)">>>%s</a>' % (tid
, pid
, pid
, mg
[1])
278 print "ERROR! %s/%s does not exist!" % (mg
[0],mg
[1])
279 return '<a href="/secondaryIndex/%s#i%s" onclick="highlight(%s)">>>%s</a>' % (mg
[0], mg
[1], mg
[1], mg
[1])
281 class WakabaParser(IBParser
):
282 replyIdRe
= re
.compile(r
""">>(\d+)""")
283 postIdRe
= re
.compile(r
"""\/(\d+)\.x?h?t?ml?(#i?(\d+))?""")
284 referenceRe
= re
.compile("""<a [^>]*href="([^"]*/)?(\d+)\.[^"]+"[^>]*>\>\;\>\;(\d+)</a>""")
def GetThreadLink(self, url, board, thread):
    """Build the HTTP address of a thread page.

    The result has the form ``http://<url>/<board>/res/<thread>.html``.
    ``thread`` is converted with ``str()``, so it may be given as a number.
    """
    page = str(thread) + '.html'
    return 'http://' + '/'.join((url, board, 'res', page))
287 def GetPostsList(self
,thread
):
288 posts
= thread
.document
.xpath("/html/body/form//*[@class='reflink']/a")
294 post
.href
= postA
.get('href')
295 post
.reflink
= postA
.getparent()
296 post
.Ids
= self
.GetPostID(post
)
297 post
.secondaryIndex
= int(post
.Ids
[1])
298 postsList
.append(post
)
302 def GetImgSrc(self
,post
):
303 cont
= post
.l
.getparent()
305 if t
.tag
.lower() == 'a':
307 if href
and href
.find('/src/') != -1:
308 if post
.thread
.forcetype
:
309 return '../src/' + post
.thread
.loader
.parseLink(href
)[4]
314 def ParseText(self
,post
):
315 if post
.bq
is not None:
317 message
= etree
.tostring(post
.bq
, pretty_print
=False,encoding
='utf-8')
318 if message
[:12].lower() == '<blockquote>' and message
[-13:].lower() == '</blockquote>':
319 message
= message
[12:-13]
321 print "Cant parse this message : '%s'" % message
323 message
= self
.referenceRe
.sub(post
.thread
.ReplaceReference
,message
)
327 def parsePost(self
,post
):
328 post
.bq
= self
.GetNextTag(post
.reflink
,'blockquote')
329 post
.l
= self
.GetPreviousTag(post
.reflink
,'label')
330 post
.title
= unicodify(post
.l
[1].text
)
335 if len(post
.cpn
)>0 and post
.cpn
[0].tag
.lower() == 'a':
336 post
.cpnHref
= post
.cpn
[0].get('href')
337 if post
.cpnHref
.find('sage') > -1:
339 post
.src
= self
.GetImgSrc(post
)
340 date
= post
.l
[2].tail
.encode('utf-8')
341 date
= date
.replace("\r",'').replace("\n",'')
342 post
.date
= DTP
.getDateTime(date
)
343 post
.message
= unicodify(self
.ParseText(post
))
345 class UpdateArchive(Command
):
346 # Parser configuration
347 summary
= "--NO SUMMARY--"
348 usage
= "--NO USAGE--"
350 parser
= Command
.standard_parser(verbose
=False)
351 parser
.add_option("--mode")
352 parser
.add_option("--chan")
353 parser
.add_option("--board")
354 parser
.add_option("--thread")
355 parser
.add_option("--chanTag")
356 parser
.add_option("--type")
357 parser
.add_option("--tags")
358 parser
.add_option("--timeDiff")
359 parser
.add_option("--directlink")
360 parser
.add_option("--list")
361 parser
.add_option("--filters")
362 parser
.add_option("--forcetype")
363 parsers
= {'wakaba':WakabaParser()}
365 """Main command to create a new shell"""
367 config_file
= 'development.ini'
368 config_name
= 'config:%s' % config_file
369 here_dir
= os
.getcwd()
370 locs
= dict(__name__
="pylons-admin")
371 conf
= appconfig(config_name
, relative_to
=here_dir
)
372 conf
.update(dict(app_conf
=conf
.local_conf
,global_conf
=conf
.global_conf
))
373 paste
.deploy
.config
.CONFIG
.push_thread_config(conf
)
374 sys
.path
.insert(0, here_dir
)
375 wsgiapp
= loadapp(config_name
, relative_to
=here_dir
)
376 test_app
= paste
.fixture
.TestApp(wsgiapp
)
377 tresponse
= test_app
.get('/_test_vars')
378 request_id
= int(tresponse
.body
)
379 test_app
.pre_request_hook
= lambda self
:paste
.registry
.restorer
.restoration_end()
380 test_app
.post_request_hook
= lambda self
:paste
.registry
.restorer
.restoration_begin(request_id
)
381 paste
.registry
.restorer
.restoration_begin(request_id
)
382 egg_info
= find_egg_info_dir(here_dir
)
383 f
= open(os
.path
.join(egg_info
, 'top_level.txt'))
384 packages
= [l
.strip() for l
in f
.readlines() if l
.strip() and not l
.strip().startswith('#')]
387 for pkg_name
in packages
:
388 # Import all objects from the base module
389 base_module
= pkg_name
+ '.lib.base'
390 found_base
= can_import(base_module
)
393 base_module
= pkg_name
+ '.controllers'
394 found_base
= can_import(base_module
)
400 raise ImportError("Could not import base module. Are you sure this is a Pylons app?")
402 base
= sys
.modules
[base_module
]
403 base_public
= [__name
for __name
in dir(base
) if not \
404 __name
.startswith('_') or __name
== '_']
405 for name
in base_public
:
406 locs
[name
] = getattr(base
, name
)
407 locs
.update(dict(wsgiapp
=wsgiapp
, app
=test_app
))
409 mapper
= tresponse
.config
.get('routes.map')
411 locs
['mapper'] = mapper
414 self
.thread
= self
.options
.thread
415 self
.chan
= self
.options
.chan
416 self
.chanTag
= self
.options
.chanTag
417 self
.board
= self
.options
.board
419 logging
.getLogger('sqlalchemy').setLevel(logging
.ERROR
)
420 GFilters
['sage'] = IBFilterSage()
421 GFilters
['lowres'] = IBFilterLowres()
422 #logging.getLogger( 'sqlalchemy').setLevel( logging.NONE )
423 if not self
.options
.mode
or self
.options
.mode
== 'update':
425 elif self
.options
.mode
== 'add':
427 elif self
.options
.mode
== 'thread':
428 if self
.options
.list:
429 f
= open(self
.options
.list,'r')
430 tList
= f
.readlines()
432 tList
= [self
.options
.thread
]
434 entry
= ArchiveList()
436 entry
.url
= self
.options
.chan
437 entry
.chanTag
= self
.options
.chanTag
438 entry
.board
= self
.options
.board
or 'b'
439 entry
.tags
= self
.options
.tags
or ''
440 entry
.type = self
.options
.type or 'wakaba'
441 entry
.filters
= self
.options
.filters
or ''
442 entry
.timeDiff
= self
.options
.timeDiff
or 0
443 entry
.lastChanged
= datetime
.datetime
.fromtimestamp(0)
444 print "Processing %s %s %s %s" % (entry
.tid
,entry
.url
,entry
.chanTag
,entry
.board
)
445 thread
= Thread(entry
,self
.parsers
,self
.options
.directlink
,self
.options
.forcetype
)
446 self
.processThread(thread
)
448 def LoadPage(self
,thread
,chan
='2ch.ru',board
='b'):
449 self
.host
= 'http://'+chan
451 self
.path
= '/'+board
+'/res/'
452 self
.url
= self
.host
+self
.path
+thread
+'.html'
454 self
.path
= '/'+board
+'/'
455 self
.url
= self
.host
+self
.path
457 req
= urllib2
.Request(self
.url
)
458 req
.add_header('Referer', self
.host
+'/'+board
+'/')
459 f
= urllib2
.urlopen(req
)
463 def getTags(self
,tagsList
):
465 for tagName
in tagsList
:
466 tag
= meta
.Session
.query(Tag
).filter(Tag
.tag
==tagName
).first()
470 tags
.append(Tag(tagName
))
473 def processPost(self
,post
):
474 post
.thread
.parser
.parsePost(post
)
477 post
.pic
= self
.LoadImage(post
)
481 post
.picid
= post
.pic
.id
482 print "Thread %s Post %s (Image:%s %s %sx%s) at %s, sage : %s" % (post
.Ids
[0],post
.Ids
[1],post
.src
,post
.pic
and post
.pic
.id or 0,post
.pic
and post
.pic
.width
or 0,post
.pic
and post
.pic
.height
or 0,post
.date
,post
.sage
)
483 if (post
.thread
.filter(post
)):
485 print "----------------------"
487 if post
.Ids
[0] == post
.Ids
[1]:
490 post
.bumpDate
= post
.date
491 post
.tags
= self
.getTags([post
.thread
.chanTag
,post
.thread
.board
]+post
.thread
.tags
)
492 post
.thread
.post
= post
494 post
.parentid
= post
.thread
.post
.id
496 post
.thread
.post
.bumpDate
= post
.date
497 post
.thread
.post
.replyCount
+= 1
499 meta
.Session
.save(post
)
500 meta
.Session
.commit()
501 idList
[post
.Ids
[1]]=[post
.id,post
.Ids
[0]]
502 print "Saved in DB as %s/%s" % (post
.id,post
.parentid
)
503 print "----------------------"
505 def processThread(self
,thread
):
506 if thread
.initialize():
508 thread
.post
= meta
.Session
.query(Post
).get(thread
.threadId
)
509 lastPost
= meta
.Session
.query(Post
).filter(Post
.parentid
==thread
.post
.id).filter(Post
.secondaryIndex
>0).order_by(Post
.secondaryIndex
.desc()).first()
511 lastId
= lastPost
.secondaryIndex
513 lastId
= int(thread
.tid
)
517 for post
in thread
.posts
:
518 if int(post
.Ids
[1]) > lastId
:
520 print "Skipped %s out of %s posts" % (skipped
,len(thread
.posts
))
522 self
.processPost(post
)
526 print "Skipped %s out of %s posts" % (skipped
,len(thread
.posts
))
529 def LoadImage(self
,post
):
530 url
= post
.thread
.loader
.getAbsolutePath(post
.src
)
531 fileName
= post
.thread
.loader
.parseLink(url
)[4]
532 res
= post
.thread
.loader
.getFromRelative(post
.src
)
534 localFilePath
= os
.path
.join(g
.OPT
.uploadPath
, fileName
)
535 localFile
= open(localFilePath
,'wb')
538 file = FieldStorageLike(fileName
,localFilePath
)
539 fileDescriptors
= self
.processFile(file, 200)
542 pic
= fileDescriptors
[0]
543 fileHolder
= fileDescriptors
[1]
544 if pic
and pic
!= -1 and fileHolder
:
545 fileHolder
.disableDeletion()
550 def processFile(self
, file, thumbSize
=250):
551 if isinstance(file, cgi
.FieldStorage
) or isinstance(file,FieldStorageLike
):
552 # We should check whether we got this file already or not
553 # If we dont have it, we add it
554 name
= str(long(time
.time() * 10**7))
555 ext
= file.filename
.rsplit('.',1)[:0:-1]
558 ext
= ext
[0].lstrip(os
.sep
)
560 # Panic, no extention found
564 # Make sure its something we want to have
566 extParams
= meta
.Session
.query(Extension
).filter(Extension
.ext
==ext
).first()
571 localFilePath
= os
.path
.join(g
.OPT
.uploadPath
, name
+ '.' + ext
)
572 localFile
= open(localFilePath
,'w+b')
573 shutil
.copyfileobj(file.file, localFile
)
575 md5
= hashlib
.md5(localFile
.read()).hexdigest()
579 pic
= meta
.Session
.query(Picture
).filter(Picture
.md5
==md5
).first()
582 os
.unlink(localFilePath
)
586 if extParams
.type == 'image':
587 thumbFilePath
= name
+ 's.' + ext
588 size
= self
.makeThumbnail(localFilePath
, os
.path
.join(g
.OPT
.uploadPath
,thumbFilePath
), (thumbSize
,thumbSize
))
590 if extParams
.type == 'image-jpg':
591 thumbFilePath
= name
+ 's.jpg'
592 size
= self
.makeThumbnail(localFilePath
, os
.path
.join(g
.OPT
.uploadPath
,thumbFilePath
), (thumbSize
,thumbSize
))
594 thumbFilePath
= extParams
.path
595 size
= [0, 0, extParams
.thwidth
, extParams
.thheight
]
597 return [-1, AngryFileHolder(localFilePath
)]
600 pic
.path
= name
+ '.' + ext
601 pic
.thumpath
= thumbFilePath
604 pic
.thwidth
= size
[2]
605 pic
.thheight
= size
[3]
606 pic
.extid
= extParams
.id
607 pic
.size
= os
.stat(localFilePath
)[6]
609 meta
.Session
.save(pic
)
610 meta
.Session
.commit()
611 return [pic
, AngryFileHolder(localFilePath
, pic
)]
615 def makeThumbnail(self
, source
, dest
, maxSize
):
616 sourceImage
= Image
.open(source
)
617 size
= sourceImage
.size
619 sourceImage
.thumbnail(maxSize
,Image
.ANTIALIAS
)
620 sourceImage
.save(dest
)
621 return size
+ sourceImage
.size
624 def AddToArchive(self
):
625 if self
.options
.thread
and self
.options
.chan
and self
.options
.chanTag
:
626 if not self
.options
.board
:
627 self
.options
.board
= 'b'
628 entry
= meta
.Session
.query(ArchiveList
).filter(ArchiveList
.tid
==self
.options
.thread
).filter(ArchiveList
.url
==self
.options
.chan
).filter(ArchiveList
.board
==self
.options
.board
).first()
630 print "Thread is already in the list"
632 entry
= ArchiveList()
633 entry
.tid
= self
.options
.thread
634 entry
.url
= self
.options
.chan
635 entry
.chanTag
= self
.options
.chanTag
636 entry
.board
= self
.options
.board
637 entry
.tags
= self
.options
.tags
or ''
638 entry
.type = self
.options
.type or 'wakaba'
639 entry
.filters
= self
.options
.filters
or ''
640 entry
.timeDiff
= self
.options
.timeDiff
or 0
641 entry
.lastChanged
= datetime
.datetime
.fromtimestamp(0)
642 meta
.Session
.save(entry
)
643 meta
.Session
.commit()
645 print "Bad parameters"
646 def UpdateArchive(self
):
647 archiveList
= meta
.Session
.query(ArchiveList
).all()
648 for entry
in archiveList
:
649 thread
= Thread(entry
,self
.parsers
)
650 state
= thread
.checkState()
651 print "*** Thread %s HTTP %s" % (thread
.directlink
,state
[0])
653 meta
.Session
.delete(entry
)
654 meta
.Session
.commit()
655 elif state
[0] == 200:
656 self
.processThread(thread
)
657 entry
.lastChanged
= state
[1]
658 meta
.Session
.commit()