extractor now also calculates duration, to speed up later stages
[ambros.git] / src / extractor
blob b76335dad60ebce4f492db1ea3dfb541f8bd7cd0
#!/bin/sh
# extractor <prefix> <reportfile> <source1> <source2> ...
#
# Must be run in the directory that contains the source directories.
# Each source is processed according to its configuration file; new clean
# text (if available) is saved inside the source directory under a name
# starting with <prefix>.  <reportfile> receives the time of the earliest
# next polling (for job planning).
#
# Relies on names provided by $SCRIPTDIR/constants.sh or the environment:
# nowsec, logfile, logit, configread, hashfunction, TMPDIR, SOURCECONFIG,
# SOURCESTATUS, MININDEX, MAXINDEX, MAXPOLLING, DEFAULTPRIORITY, SPEED,
# SOURCEFILESUFFIX.

if test $# -lt 2
then
  echo "usage: extractor <prefix> <reportfile> <source1> <source2> ..." >&2
  exit 2
fi

# read global settings
. "$SCRIPTDIR/constants.sh"

tmpf1="$TMPDIR/extractor1$$.tmp"
tmpf2="$TMPDIR/extractor2$$.tmp"
# remove the temporary files on every exit path, including interruption
trap 'rm -f "$tmpf1" "$tmpf2"' EXIT
trap 'exit 1' HUP INT TERM

# fall back to a large interval so later arithmetic never sees an empty value
maxpolling=${MAXPOLLING:-999999}
minpoll=$maxpolling
starttime=$(nowsec)

# only one character for prefix
prefix=$(printf '%s\n' "$1" | sed -e 's/\(.\).*/\1/')
reportfile=$2
shift 2
suffix=${SOURCEFILESUFFIX:-.txt}

logf=$(logfile "extractor-$prefix")
logit "$logf" extractor working at "$starttime:"

# remember where we started so every iteration returns to the same place,
# even if a source name is not a plain subdirectory
channeldir=$(pwd)

for source in "$@"
do
  # a failed cd must skip the source; otherwise the body would run in the
  # channel directory and the return step would leave it
  if ! cd "$source"
  then
    logit "$logf" "$source" cannot enter directory, ignoring
    continue
  fi

  # ignore directories without readable configuration file
  if test -r "$SOURCECONFIG"
  then
    # start from a clean slate so values of the previous source do not leak
    # into a source that has no status file yet
    index=
    lasthash=
    lastpoll=

    # read status values or set default ones
    if test -r "$SOURCESTATUS"
    then
      index=$(configread "$SOURCESTATUS" INDEX)
      lasthash=$(configread "$SOURCESTATUS" LASTHASH)
      lastpoll=$(configread "$SOURCESTATUS" LASTPOLL)
    fi

    # for index, use source configuration or global start value
    minindex=$(configread "$SOURCECONFIG" MININDEX)
    minindex=${minindex:-$MININDEX}
    maxindex=$(configread "$SOURCECONFIG" MAXINDEX)
    maxindex=${maxindex:-$MAXINDEX}
    index=${index:-$minindex}
    lasthash=${lasthash:-nil}
    lastpoll=${lastpoll:-0}
    now=$(nowsec)

    # get polling interval, set to maximum if unset or empty
    polling=$(configread "$SOURCECONFIG" POLLING) || polling=$maxpolling
    polling=${polling:-$maxpolling}
    #echo : lastpoll $lastpoll : pollinterval $polling : now $now

    # remaining seconds until the next polling is due (<= 0 means due)
    polling=$(expr "$lastpoll" + "$polling" - "$now")

    # poll if passed
    if test "$polling" -le 0
    then
      logit "$logf" "$source" needs polling

      # source descriptor (url, file, etc)
      desc=$(configread "$SOURCECONFIG" SOURCE) || desc=file:///dev/null
      prio=$(configread "$SOURCECONFIG" PRIORITY) || prio=$DEFAULTPRIORITY
      recipe=$(configread "$SOURCECONFIG" RECIPE)
      if test ! -x "$recipe"
      then
        logit "$logf" "$source" ignoring non-executable recipe "$recipe"
        recipe=/bin/cat
      fi

      if "$SCRIPTDIR/fetcher" "$desc" >"$tmpf1"
      then
        "$recipe" <"$tmpf1" >"$tmpf2"
        newhash=$(hashfunction "$tmpf2")
        # quoted so that an empty hash is a comparison, not a syntax error
        if test "$lasthash" != "$newhash"
        then
          # duration is computed from the text after the first line
          # starting with "---"; SPEED is left unquoted on purpose so that
          # an unset value passes no argument at all
          duration=$(sed -e '1,/^---/d' "$tmpf2" | "$SCRIPTDIR/morse/sniptime" $SPEED)
          stname="$prefix$prio$index$suffix"
          # NOTE(review): decay is never assigned in this script; presumably
          # it comes from constants.sh or should be read from the source
          # configuration -- confirm.
          # PBL
          cat <<EOH >"$stname"
IDENTIFICATION $desc
PRIORITY $prio
INDEX $index
DECAY $decay
SPEED $SPEED
DURATION $duration
EOH
          cat "$tmpf2" >>"$stname"

          # advance the index, wrapping around to the minimum
          index=$(expr "$index" + 1)
          if test "$index" -gt "$maxindex"
          then
            index=$minindex
          fi

          logit "$logf" "$source" fetched with new hash "$newhash"
          lasthash=$newhash
          lastpoll=$now
        else
          logit "$logf" "$source" has same hash "$newhash"
        fi
      else
        logit "$logf" "$source" fetching or filtering failed
      fi

      # re-read polling interval
      polling=$(configread "$SOURCECONFIG" POLLING) || polling=$maxpolling
      polling=${polling:-$maxpolling}
    fi

    # calculate future minimal polling time
    if test "$minpoll" -gt "$polling"
    then
      minpoll=$polling
    fi

    # save current status
    cat <<EOT >"$SOURCESTATUS"
INDEX $index
LASTHASH $lasthash
LASTPOLL $lastpoll
EOT
  else
    logit "$logf" "$source" no configfile found, ignoring
  fi

  # go back to channel directory
  cd "$channeldir" || exit 1
done

# save closest next polling time for controller
printf 'NEXTPOLL %s\n' "$(expr "$starttime" + "$minpoll")" >"$reportfile"

# temporary files are removed by the EXIT trap set above