3 # urlwatch is a minimalistic URL watcher written in Python
5 # Copyright (c) 2008 Thomas Perl <thp|thpinfo.com>
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
11 # 1. Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # 2. Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # 3. The name of the author may not be used to endorse or promote products
17 # derived from this software without specific prior written permission.
19 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 """Watch web pages and arbitrary URLs for changes"""
# Project metadata.
__author__ = 'Thomas Perl <thpinfo.com>'
__copyright__ = 'Copyright 2008 Thomas Perl'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'

# Identification string placed in the 'User-agent' request header by the
# main script.  pkgname and __version__ are not assigned in this section
# and must already be defined when this line runs.
user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (
    pkgname, __version__)

# Configuration section
# NOTE(review): presumably decides whether failed downloads show up in the
# report -- confirm against the main loop.
display_errors = False
# File and folder paths
# All per-user data lives below ~/.<pkgname>.
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Check if we are installed in the system already: split the directory
# that contains the running script into its parent and its own name.
(prefix, bindir) = os.path.split(
    os.path.dirname(os.path.abspath(sys.argv[0])))

# NOTE(review): the test below is reconstructed from the two "Assume ..."
# comments; without a branch the second assignment always overwrote the
# first.  Confirm that 'bin' is the intended marker for an installed copy.
if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
    # Make the bundled 'lib' folder next to the script importable.
    sys.path.append(os.path.join(prefix, bindir, 'lib'))

# Templates that the main script copies into the user's folders.
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (e.g. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    Parameters:
      type    -- message type; it is upper-cased for the headline
      url     -- the URL the message is about
      content -- optional body; anything that is not already text
                 (such as an exception object) is converted with str()
      summary -- optional list; exactly one item describing this
                 message is appended to it
      c       -- character used to draw the separator lines
      n       -- length of the separator lines

    The return value is a list of strings (one item per line).
    """
    # The main script also hands exception objects in as content.  Turn
    # them into text so that len() below, and a later '\n'.join() over
    # the returned list, both work.  (basestring: this file is Python 2.)
    if content is not None and not isinstance(content, basestring):
        content = str(content)

    summary_txt = ': '.join((type.upper(), url))

    if summary is not None:
        # One summary entry per message; the size is only known (and
        # only meaningful) when there is content.
        if content is None:
            summary.append(summary_txt)
        else:
            summary.append('%s (%d bytes)' % (summary_txt, len(content)))

    separator = c * n
    result = [separator, summary_txt]
    if content is not None:
        result += [separator, content]
    # Two empty items leave a gap before the next message.
    result += [separator, '', '']
    return result
107 if __name__
== '__main__':
108 start
= datetime
.datetime
.now()
110 # Created all needed folders
111 for needed_dir
in (urlwatch_dir
, cache_dir
, scripts_dir
):
112 if not os
.path
.isdir(needed_dir
):
113 os
.makedirs(needed_dir
)
115 # Check for required files
116 if not os
.path
.isfile(urls_txt
):
117 urls_txt_fn
= os
.path
.join(os
.path
.dirname(urls_txt
), os
.path
.basename(urls_txt_example
))
118 hooks_py_fn
= os
.path
.join(os
.path
.dirname(hooks_py
), os
.path
.basename(hooks_py_example
))
119 print 'Error: You need to create a urls.txt file first.'
121 print 'Place it in %s' % (urls_txt
)
122 print 'An example is available in %s' % (urls_txt_fn
)
124 print 'You can also create %s' % (hooks_py
)
125 print 'An example is available in %s' % (hooks_py_fn
)
127 if os
.path
.exists(urls_txt_example
) and not os
.path
.exists(urls_txt_fn
):
128 shutil
.copy(urls_txt_example
, urls_txt_fn
)
129 if os
.path
.exists(hooks_py_example
) and not os
.path
.exists(hooks_py_fn
):
130 shutil
.copy(hooks_py_example
, hooks_py_fn
)
134 'User-agent': user_agent
,
141 if os
.path
.exists(hooks_py
):
142 hooks
= imp
.load_source('hooks', hooks_py
)
143 if hasattr(hooks
, 'filter'):
144 filter = hooks
.filter
146 print 'WARNING: %s has no filter function - ignoring' % hooks_py
147 filter = lambda x
, y
: y
149 filter = lambda x
, y
: y
151 for url
in (x
for x
in open(urls_txt
).read().splitlines() if not (x
.startswith('#') or x
.strip()=='')):
152 filename
= os
.path
.join(cache_dir
, sha
.new(url
).hexdigest())
154 request
= urllib2
.Request(url
, None, headers
)
155 data
= filter(url
, urllib2
.urlopen(request
).read())
156 if os
.path
.exists(filename
):
157 old_data
= open(filename
).read()
158 diff
= ''.join(difflib
.unified_diff(old_data
.splitlines(1), data
.splitlines(1)))
160 details
+= foutput('changed', url
, diff
, summary
)
162 details
+= foutput('new', url
, None, summary
)
163 open(filename
, 'w').write(data
)
164 except urllib2
.HTTPError
, error
:
166 details
+= foutput('error', url
, error
, summary
)
167 except urllib2
.URLError
, error
:
169 details
+= foutput('error', url
, error
, summary
)
172 end
= datetime
.datetime
.now()
176 print '-'*line_length
177 print 'summary: %d changes' % (len(summary
),)
179 for id, line
in enumerate(summary
):
180 print '%02d. %s' % (id+1, line
)
181 print '-'*line_length
184 print '\n'.join(details
)
186 print '%s %s, %s' % (pkgname
, __version__
, __copyright__
)
187 print 'Website: %s' % (__homepage__
,)
188 print 'watched %d URLs in %d seconds\n' % (count
, (end
-start
).seconds
)