# extractor <prefix> <reportfile> <source1> <source2> ...
# must be run in the directory of the sources
# will process each source according to its configuration file
# and save new clean text (if available) in the same directory
# with <prefix> as start of name and <reportfile> for reporting
# time of earliest next polling (for job planning) and other information
# NOTE(review): this text looks like it was extracted from a numbered listing:
# every statement is preceded by a stray listing number and most assignments
# are split in two (e.g. "11 tmpf1" / "=$TMPDIR/..." was presumably the single
# line "tmpf1=$TMPDIR/..."). Code tokens are left untouched here; restore the
# script from its original before running it.
#
# Setup section: load shared settings, pick scratch file names, derive the
# one-character prefix and file suffix, and open the log.
#
# Dot-command and its operand (presumably ". $SCRIPTDIR/constants.sh").
10 .
$SCRIPTDIR/constants.sh
# Two scratch file names under $TMPDIR; $$ (the shell's PID) keeps them
# distinct per running process.
11 tmpf1
=$TMPDIR/extractor1$$.tmp
12 tmpf2
=$TMPDIR/extractor2$$.tmp
# Running minimum of polling times, initialised to $MAXPOLLING or, when that
# is unset or empty, to 999999.
14 minpoll
=${MAXPOLLING:-999999}
17 # only one character for prefix
# The sed expression keeps just the first character of $1.
18 prefix
=`echo $1 | sed -e 's/\(.\).*/\1/'`
# Suffix for saved files: $SOURCEFILESUFFIX, defaulting to ".txt".
21 suffix
=${SOURCEFILESUFFIX:-.txt}
# logfile and logit are not defined in this excerpt (presumably supplied by
# constants.sh — confirm). The log name is derived from "extractor-$prefix".
23 logf
=`logfile extractor-$prefix`
# NOTE(review): $starttime is not assigned anywhere in the visible lines.
24 logit
$logf extractor working
at $starttime:
# Per-source processing followed by the final report to the controller.
#
# NOTE(review): the listing numbers jump (24 -> 29, 30 -> 32, 61 -> 63, ...),
# so several original lines are not visible here: the loop header over the
# sources, the then/fi keywords, and some branch bodies. $source, $now,
# $reportfile, $SOURCECONFIG and $SOURCESTATUS are used below but never
# assigned in the visible text. Do not treat this excerpt as complete.
29 # ignore directories without readable configuration file
30 if test -r $SOURCECONFIG
32 # read status values or set default ones
# Only when the status file is readable are INDEX, LASTHASH and LASTPOLL
# taken from it via configread.
33 if test -r $SOURCESTATUS
35 index
=`configread $SOURCESTATUS INDEX`
36 lasthash
=`configread $SOURCESTATUS LASTHASH`
37 lastpoll
=`configread $SOURCESTATUS LASTPOLL`
39 # for index, use source configuration or global start value
# MININDEX/MAXINDEX come from the source configuration; an empty result falls
# back to the global $MININDEX/$MAXINDEX.
40 minindex
=`configread $SOURCECONFIG MININDEX`
41 minindex
=${minindex:-$MININDEX}
42 maxindex
=`configread $SOURCECONFIG MAXINDEX`
43 maxindex
=${maxindex:-$MAXINDEX}
# Defaults for values the status file did not provide: index starts at
# minindex, lasthash is the placeholder "nil", lastpoll is 0.
44 index
=${index:-$minindex}
45 lasthash
=${lasthash:-nil}
46 lastpoll
=${lastpoll:-0}
48 # get polling interval, set to maximum if unset
# The fallback to $MAXPOLLING is taken when configread exits non-zero.
49 polling
=`configread $SOURCECONFIG POLLING` || polling
=$MAXPOLLING
50 #echo : lastpoll $lastpoll : pollinterval $polling : now $now
51 # calculate time of next polling
# From here on "polling" no longer holds the interval but the time remaining
# until the next poll: lastpoll + interval - now.
52 polling
=`expr $lastpoll + $polling - $now`
# A remaining time of zero or less means the source is due.
54 if test $polling -le 0
56 logit
$logf $source needs polling
57 # source descriptor (url, file, etc)
# Fallbacks when configread fails: descriptor file:///dev/null and priority
# $DEFAULTPRIORITY.
58 desc
=`configread $SOURCECONFIG SOURCE` || desc
=file:///dev
/null
59 prio
=`configread $SOURCECONFIG PRIORITY` || prio
=$DEFAULTPRIORITY
# The recipe is the program that filters fetched data; it must be executable.
60 recipe
=`configread $SOURCECONFIG RECIPE`
61 if test ! -x "$recipe"
63 logit
$logf $source ignoring non-executable recipe
$recipe
# Fetch the descriptor into the first scratch file; the condition is the
# fetcher's exit status.
66 if $SCRIPTDIR/fetcher
"$desc" > $tmpf1
# Run the recipe over the fetched data, writing the result to the second
# scratch file, then hash that result.
68 $recipe <$tmpf1 >$tmpf2
69 newhash
=`hashfunction $tmpf2`
# Changed content is detected by comparing against the previous hash.
# NOTE(review): both operands are unquoted, so an empty $newhash would make
# test fail with a syntax error rather than compare.
70 if test $lasthash != $newhash
# Target file name: prefix, priority, index and suffix concatenated.
# NOTE(review): listing lines 73-80 are not visible; presumably they store the
# new text under $stname — confirm against the original.
72 stname
="$prefix$prio$index$suffix"
81 index
=`expr $index + 1`
# NOTE(review): the body of this test (listing lines 83-84) is not visible;
# presumably the index wraps around once it exceeds maxindex — confirm.
82 if test $index -gt $maxindex
85 logit
$logf $source fetched with new
hash $newhash
89 logit
$logf $source has same
hash $newhash
92 logit
$logf $source fetching or filtering failed
94 # re-read polling interval
# Needed because "polling" was overwritten above with the remaining time.
95 polling
=`configread $SOURCECONFIG POLLING` || polling
=$MAXPOLLING
97 # calculate future minimal polling time
# NOTE(review): the body of this test (listing lines 99-100) is not visible;
# presumably minpoll is lowered to $polling when that is smaller — confirm.
98 if test $minpoll -gt $polling
# The here-document below rewrites $SOURCESTATUS. NOTE(review): its body and
# the closing EOT (listing lines 103-106) are not visible, so as written the
# heredoc is unterminated and would swallow the rest of this text.
# After it: the else branch of the configuration check logs that no
# configfile was found, and the last statement writes "NEXTPOLL" followed by
# starttime + minpoll to $reportfile.
101 # save current status
102 cat <<EOT >$SOURCESTATUS
107 else logit
$logf $source no configfile found
, ignoring
109 # go back to channel directory
113 # save closest next polling time for controller
114 echo NEXTPOLL
`expr $starttime + $minpoll` > $reportfile