consolidate logging, install hash processing,
[ambros.git] / src / extractor
blobf1359c818df368f139477ca6216abd13b800e930
#!/bin/sh
# extractor <prefix> <reportfile> <source1> <source2> ...
#
# Must be run in the directory that contains the source directories.
# Each source is processed according to its configuration file; new clean
# text (if available) is saved inside the source directory in a file whose
# name starts with <prefix>.  <reportfile> receives the time of the earliest
# next polling (for job planning).
#
# Names used but not defined in this script (expected to come from
# constants.sh or the environment -- not visible here, verify there):
#   variables: SCRIPTDIR TMPDIR MAXPOLLING SOURCEFILESUFFIX SOURCECONFIG
#              SOURCESTATUS MININDEX MAXINDEX DEFAULTPRIORITY
#   commands:  nowsec logfile logit configread hashfunction
#   program:   $SCRIPTDIR/fetcher

# read global settings
. "$SCRIPTDIR/constants.sh" || exit 1

# both leading arguments are mandatory; "shift 2" below needs them
if test $# -lt 2
then
  echo "usage: extractor <prefix> <reportfile> <source1> <source2> ..." >&2
  exit 2
fi

tmpf1=$TMPDIR/extractor1$$.tmp
tmpf2=$TMPDIR/extractor2$$.tmp
# remove the scratch files on every exit path, including interruption
trap 'rm -f "$tmpf1" "$tmpf2"' EXIT
trap 'exit 1' HUP INT TERM

minpoll=${MAXPOLLING:-999999}
starttime=$(nowsec)

# only one character for prefix
prefix=$(printf '%s\n' "$1" | sed -e 's/\(.\).*/\1/')
reportfile=$2
shift 2
suffix=${SOURCEFILESUFFIX:-.txt}

logf=$(logfile "extractor-$prefix")
logit "$logf" extractor working at "$starttime:"

# remember where to return to after visiting each source directory
channeldir=$(pwd)

for source in "$@"
do
  # a source that cannot be entered must not derail the remaining ones
  if ! cd "$source"
  then
    logit "$logf" "$source" cannot enter directory, ignoring
    continue
  fi

  # ignore directories without readable configuration file
  if test -r "$SOURCECONFIG"
  then
    # start from a clean slate so that values of the previous source
    # cannot leak into a source that has no status file yet
    index=
    lasthash=
    lastpoll=

    # read status values or set default ones
    if test -r "$SOURCESTATUS"
    then
      index=$(configread "$SOURCESTATUS" INDEX)
      lasthash=$(configread "$SOURCESTATUS" LASTHASH)
      lastpoll=$(configread "$SOURCESTATUS" LASTPOLL)
    fi

    # for index, use source configuration or global start value
    minindex=$(configread "$SOURCECONFIG" MININDEX)
    minindex=${minindex:-$MININDEX}
    maxindex=$(configread "$SOURCECONFIG" MAXINDEX)
    maxindex=${maxindex:-$MAXINDEX}
    index=${index:-$minindex}
    lasthash=${lasthash:-nil}
    lastpoll=${lastpoll:-0}
    now=$(nowsec)

    # get polling interval, set to maximum if unset
    polling=$(configread "$SOURCECONFIG" POLLING) || polling=$MAXPOLLING

    # calculate time remaining until next polling; expr is kept on purpose:
    # with non-numeric configuration it merely fails (leaving the value
    # empty, so the source is skipped) instead of aborting the whole run
    polling=$(expr "$lastpoll" + "$polling" - "$now")

    # poll if passed
    if test "$polling" -le 0
    then
      logit "$logf" "$source" needs polling
      # source descriptor (url, file, etc)
      desc=$(configread "$SOURCECONFIG" SOURCE) || desc=file:///dev/null
      prio=$(configread "$SOURCECONFIG" PRIORITY) || prio=$DEFAULTPRIORITY
      recipe=$(configread "$SOURCECONFIG" RECIPE)
      if test ! -x "$recipe"
      then
        logit "$logf" "$source" ignoring non-executable recipe "$recipe"
        recipe=/bin/cat
      fi

      if "$SCRIPTDIR/fetcher" "$desc" > "$tmpf1"
      then
        "$recipe" < "$tmpf1" > "$tmpf2"
        newhash=$(hashfunction "$tmpf2")
        if test -z "$newhash"
        then
          # without a hash, new and old content cannot be told apart
          logit "$logf" "$source" hashing failed
        elif test "$lasthash" != "$newhash"
        then
          stname="$prefix$prio$index$suffix"
          # PBL (marker kept from the original; its meaning is not evident here)
          # NOTE(review): decay is never assigned in this script, so the
          # DECAY field is empty unless constants.sh or the environment
          # provides it -- confirm the intended origin.
          {
            printf 'IDENTIFICATION %s\n' "$desc"
            printf 'PRIORITY %s\n' "$prio"
            printf 'INDEX %s\n' "$index"
            printf 'DECAY %s\n' "$decay"
            cat "$tmpf2"
          } > "$stname"

          # advance the index, wrapping around to the minimum
          index=$(expr "$index" + 1)
          if test "$index" -gt "$maxindex"
          then
            index=$minindex
          fi

          logit "$logf" "$source" fetched with new hash "$newhash"
          lasthash=$newhash
          # the poll time is only advanced when new content was stored
          lastpoll=$now
        else
          logit "$logf" "$source" has same hash "$newhash"
        fi
      else
        logit "$logf" "$source" fetching or filtering failed
      fi

      # re-read polling interval: after a polling attempt the full
      # interval is what counts for the next-poll report
      polling=$(configread "$SOURCECONFIG" POLLING) || polling=$MAXPOLLING
    fi

    # calculate future minimal polling time
    if test "$minpoll" -gt "$polling"
    then
      minpoll=$polling
    fi

    # save current status
    {
      printf 'INDEX %s\n' "$index"
      printf 'LASTHASH %s\n' "$lasthash"
      printf 'LASTPOLL %s\n' "$lastpoll"
    } > "$SOURCESTATUS"
  else
    logit "$logf" "$source" no configfile found, ignoring
  fi

  # go back to channel directory
  cd "$channeldir" || exit 1
done

# save closest next polling time for controller
echo "NEXTPOLL $(expr "$starttime" + "$minpoll")" > "$reportfile"

# scratch files are removed by the EXIT trap installed above