# urlwatch is a minimalistic URL watcher written in Python
# Started: 2008-03-04 Thomas Perl <thpinfo.com>

"""Watch web pages and arbitrary URLs for changes"""

# Package metadata.
# NOTE(review): __version__ is referenced below but its assignment is not
# visible in this chunk — presumably defined between these lines; confirm.
__author__ = 'Thomas Perl <thpinfo.com>'
__copyright__ = 'Copyright 2008 Thomas Perl'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'

# HTTP User-Agent header sent with every request, so server operators can
# identify the watcher and look up what it is.
user_agent = 'urlwatch/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % __version__

# Configuration section
# When True, fetch failures are included in the report (see the HTTPError /
# URLError handlers in the main loop); when False they are silently skipped.
display_errors = False
# File and folder paths
# All per-user state lives under ~/.urlwatch:
urlwatch_dir = os.path.expanduser(os.path.join('~', '.urlwatch'))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')    # URLs to watch, one per line; '#' starts a comment
cache_dir = os.path.join(urlwatch_dir, 'cache')      # last-seen content, one file per URL (keyed by hash)
scripts_dir = os.path.join(urlwatch_dir, 'lib')      # user-provided scripts
hooks_py = os.path.join(scripts_dir, 'hooks.py')     # optional user hook module with a filter() function
# Check if we are installed in the system already
# Splits the absolute directory of the running script into its parent
# ("prefix", e.g. /usr) and its own name ("bindir", e.g. bin).
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
# NOTE(review): the conditional that chooses between the two examples_dir
# candidates (likely testing bindir == 'bin' or the existence of the share
# path) is missing from this view — as written the second assignment always
# wins. Confirm against the full file before relying on the first branch.
# Assume we are installed in system
examples_dir = os.path.join(prefix, 'share', 'urlwatch', 'examples')
# Assume we are not yet installed
examples_dir = os.path.join(prefix, bindir, 'examples')

# Template urls.txt shipped alongside the program, offered to new users.
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    # Headline, e.g. "CHANGED: http://example.com/"
    summary_txt = ': '.join((type.upper(), url))

    if summary is not None:
        if content is None:
            # No payload (e.g. a newly-seen URL): just the headline.
            summary.append(summary_txt)
        else:
            # Include the payload size so the summary hints at change magnitude.
            summary.append('%s (%d bytes)' % (summary_txt, len(content)))

    # Separator line (n repetitions of c), then the headline...
    result = [c*n, summary_txt]
    if content is not None:
        # ...then the (possibly multi-line) content under its own separator.
        result += [c*n, content]
    # Closing separator plus two empty lines to space out entries.
    result += [c*n, '', '']

    return result
78 if __name__
== '__main__':
79 start
= datetime
.datetime
.now()
81 # Created all needed folders
82 for needed_dir
in (urlwatch_dir
, cache_dir
, scripts_dir
):
83 if not os
.path
.isdir(needed_dir
):
84 os
.makedirs(needed_dir
)
86 # Check for required files
87 if not os
.path
.isfile(urls_txt
):
88 example_fn
= os
.path
.join(os
.path
.dirname(urls_txt
), os
.path
.basename(urls_txt_example
))
89 print 'Error: You need to create a urls.txt file first.'
91 print 'Place it in %s' % (urls_txt
)
92 print 'An example is available in %s' % (example_fn
)
94 if os
.path
.exists(urls_txt_example
) and not os
.path
.exists(example_fn
):
95 shutil
.copy(urls_txt_example
, example_fn
)
99 'User-agent': user_agent
,
106 if os
.path
.exists(hooks_py
):
107 hooks
= imp
.load_source('hooks', hooks_py
)
108 if hasattr(hooks
, 'filter'):
109 filter = hooks
.filter
111 print 'WARNING: %s has no filter function - ignoring' % hooks_py
112 filter = lambda x
, y
: y
114 filter = lambda x
, y
: y
116 for url
in (x
for x
in open(urls_txt
).read().splitlines() if not (x
.startswith('#') or x
.strip()=='')):
117 filename
= os
.path
.join(cache_dir
, sha
.new(url
).hexdigest())
119 request
= urllib2
.Request(url
, None, headers
)
120 data
= filter(url
, urllib2
.urlopen(request
).read())
121 if os
.path
.exists(filename
):
122 old_data
= open(filename
).read()
123 diff
= ''.join(difflib
.unified_diff(old_data
.splitlines(1), data
.splitlines(1)))
125 details
+= foutput('changed', url
, diff
, summary
)
127 details
+= foutput('new', url
, None, summary
)
128 open(filename
, 'w').write(data
)
129 except urllib2
.HTTPError
, error
:
131 details
+= foutput('error', url
, error
, summary
)
132 except urllib2
.URLError
, error
:
134 details
+= foutput('error', url
, error
, summary
)
137 end
= datetime
.datetime
.now()
141 print '-'*line_length
142 print 'summary: %d changes' % (len(summary
),)
144 for id, line
in enumerate(summary
):
145 print '%02d. %s' % (id+1, line
)
146 print '-'*line_length
149 print '\n'.join(details
)
151 print 'urlwatch %s, %s' % (__version__
, __copyright__
)
152 print 'Website: %s' % (__homepage__
,)
153 print 'watched %d URLs in %d seconds\n' % (count
, (end
-start
).seconds
)