d02f6eebde190b5548fbc8d4c7ecd7fa11eedc5a
[urlwatch.git] / lib / urlwatch / handler.py
blobd02f6eebde190b5548fbc8d4c7ecd7fa11eedc5a
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2010 Thomas Perl <thp@thpinfo.com>
7 # All rights reserved.
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 try:
33 # Available in Python 2.5 and above and preferred if available
34 import hashlib
35 have_hashlib = True
36 except ImportError:
37 # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
38 # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
39 import sha
40 have_hashlib = False
42 import subprocess
43 import email.Utils
44 import urllib2
45 import os
46 import stat
47 import sys
48 import re
class JobBase(object):
    """Common base class for jobs, each identified by a location string.

    Subclasses override retrieve() to fetch the current content that
    belongs to the location.
    """

    def __init__(self, location):
        # The job specification as given by the caller; it is also the
        # input from which get_guid() derives the job's identifier
        self.location = location

    def __str__(self):
        return self.location

    def get_guid(self):
        """Return the hexadecimal SHA-1 digest of the job location.

        Text (unicode) locations are encoded as UTF-8 before hashing,
        because the hash functions operate on byte strings. Byte-string
        locations are hashed unchanged, so their digests stay the same.
        """
        location = self.location
        if isinstance(location, type(u'')):
            location = location.encode('utf-8')

        if have_hashlib:
            sha_hash = hashlib.new('sha1')
            sha_hash.update(location)
            return sha_hash.hexdigest()
        else:
            # Fallback for interpreters without hashlib
            return sha.new(location).hexdigest()

    def retrieve(self, timestamp=None, filter=None, headers=None):
        """Fetch the content for this job; subclasses must override this.

        Raises NotImplementedError (a subclass of Exception) when called
        on a class that does not provide its own implementation.
        """
        raise NotImplementedError('Not implemented')
class ShellJob(JobBase):
    """Job whose location is a command line that is run through the shell."""

    def retrieve(self, timestamp=None, filter=None, headers=None):
        """Run the command and return its standard output.

        timestamp and headers are accepted for interface compatibility
        with the other job classes and are not used here.

        If filter is given, it is called as filter(location, output) and
        its result is returned; without a filter the captured output is
        returned unchanged.
        """
        # The location is handed to the shell as one command line string
        process = subprocess.Popen(self.location,
                                   stdout=subprocess.PIPE,
                                   shell=True)
        # Only stdout is piped; stderr is not redirected by this call
        stdout_data, _ = process.communicate()

        if filter is None:
            return stdout_data
        return filter(self.location, stdout_data)
class UrlJob(JobBase):
    """Job whose location is a URL that is fetched with urllib2."""

    # Extracts the charset from a text/html or text/plain Content-type value
    CHARSET_RE = re.compile('text/(html|plain); charset=(.*)')

    def retrieve(self, timestamp=None, filter=None, headers=None):
        """Download the URL and return its content.

        timestamp -- if not None, it is formatted with
                     email.Utils.formatdate() and sent as the
                     If-Modified-Since request header
        filter    -- optional callable, invoked as filter(location, content);
                     its result is returned. Without a filter the content
                     is returned as is.
        headers   -- optional request headers (anything dict() accepts);
                     the caller's object is copied, never modified

        When the Content-type response header names a charset for
        text/html or text/plain, the content is converted from that
        charset to UTF-8 (undecodable bytes are dropped).
        """
        request_headers = dict(headers or {})
        if timestamp is not None:
            request_headers['If-Modified-Since'] = \
                    email.Utils.formatdate(timestamp)

        request = urllib2.Request(self.location, None, request_headers)
        response = urllib2.urlopen(request)
        try:
            response_headers = response.info()
            content = response.read()
        finally:
            # Release the connection even if reading fails
            response.close()

        # Determine content type via HTTP headers
        encoding = None
        content_type = response_headers.get('Content-type', '')
        content_type_match = self.CHARSET_RE.match(content_type)
        if content_type_match:
            # The charset value may be quoted or padded with whitespace
            charset = content_type_match.group(2).strip().strip('"\'')
            if charset:
                encoding = charset

        if encoding is not None:
            try:
                # Convert from specified encoding to utf-8
                content = content.decode(encoding, 'ignore').encode('utf-8')
            except LookupError:
                # Unknown charset name: keep the bytes as received
                # rather than failing the whole retrieval
                pass

        if filter is None:
            return content
        return filter(self.location, content)
def _shelljob_user_name(uid):
    """Return the account name for uid, or the uid as a string if unknown."""
    try:
        # Imported locally: the password database module is Unix-only.
        # Unlike os.getlogin() this does not need a controlling terminal.
        import pwd
        return pwd.getpwuid(uid)[0]
    except (ImportError, KeyError):
        return str(uid)


def _shelljob_permission_errors(path, current_uid):
    """Return a list of reasons why path is unsafe for defining shell jobs."""
    errors = []
    path_st = os.stat(path)
    if (path_st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0:
        errors.append('%s is group/world-writable' % path)
    if path_st.st_uid != current_uid:
        errors.append('%s not owned by %s' %
                      (path, _shelljob_user_name(current_uid)))
    return errors


def parse_urls_txt(urls_txt):
    """Parse the job list file urls_txt and return a list of job objects.

    Lines that are empty or whose first non-blank character is '#' are
    skipped. A line starting with '|' becomes a ShellJob for the rest of
    the line; every other line becomes a UrlJob for the whole line.

    Shell jobs are only accepted if both the file and its directory are
    owned by the current user and are not group- or world-writable. If
    that is not the case, the first shell job line causes a warning on
    stderr and the process exits with status 1.

    Raises OSError/IOError if the file or its directory cannot be
    inspected or read.
    """
    # Security checks for shell jobs - only execute if the current UID
    # is the same as the file/directory owner and only owner can write
    current_uid = os.getuid()

    # dirname() is '' for a bare file name; that means the current directory
    dirname = os.path.dirname(urls_txt) or os.curdir
    shelljob_errors = _shelljob_permission_errors(dirname, current_uid)
    shelljob_errors.extend(_shelljob_permission_errors(urls_txt, current_uid))
    allow_shelljobs = not shelljob_errors

    urls_file = open(urls_txt)
    try:
        lines = urls_file.read().splitlines()
    finally:
        urls_file.close()

    jobs = []
    for line in lines:
        stripped = line.strip()
        if stripped == '' or stripped.startswith('#'):
            continue

        if not line.startswith('|'):
            jobs.append(UrlJob(line))
        elif allow_shelljobs:
            jobs.append(ShellJob(line[1:]))
        else:
            warning = ['\n SECURITY WARNING - Cannot run shell jobs:\n']
            warning.extend(['  ' + error for error in shelljob_errors])
            warning.append(
                    '\n Please remove shell jobs or fix these problems.\n')
            sys.stderr.write('\n'.join(warning) + '\n')
            sys.exit(1)

    return jobs