1 package MogileFS
::Worker
::Delete
;
5 use base
'MogileFS::Worker';
6 use MogileFS
::Util
qw(error);
8 # we select 1000 but only do a random 100 of them, to allow
9 # for stateless parallelism
# LIMIT: the "select 1000" figure from the note above.
# NOTE(review): LIMIT is not referenced in any of the visible lines -- confirm
# where it is applied to the queue query.
10 use constant LIMIT
=> 1000;
# PER_BATCH: process_deletes stops iterating its shuffled rows once its
# per-call counter exceeds this value.
11 use constant PER_BATCH
=> 100;
13 # TODO: use LWP and persistent connections to do deletes. less local ports used.
# Constructor body (the enclosing `sub` header and closing brace are not among
# the visible lines; the name `new` is inferred from the SUPER::new call).
# Arguments: the class name and $psock, which is handed unchanged to the
# parent class constructor.
# NOTE(review): no `return $self` is visible here -- confirm the object is
# actually returned to the caller.
16 my ($class, $psock) = @_;
# Allocate the object through the `fields` pragma rather than a bare bless.
17 my $self = fields
::new
($class);
18 $self->SUPER::new
($psock);
# Watchdog timeout for this worker class.
# Returns the constant 120.
# NOTE(review): presumably a number of seconds consumed by the MogileFS::Worker
# base class -- confirm the unit there.
sub watchdog_timeout {
    return 120;
}
# Body of the worker's main routine.
# NOTE(review): the `sub` header, the loop construct, the `$self` and `$delres`
# declarations, and the code that actually sleeps are not among the visible
# lines -- the notes below describe only what is shown.
#
# Each pass runs three cleanup steps (process_tempfiles, the old-style
# process_deletes, the new-style process_deletes2) and grows an idle counter
# when none of them reported work.
28 my $sleep_for = 0; # we sleep longer and longer until we hit max_sleep
29 my $sleep_max = 5; # max sleep when there's nothing to do.
31 my $old_queue_check = 0; # next time to check the old queue.
32 my $old_queue_backoff = 0; # backoff index
34 # wait for one pass of the monitor
35 $self->wait_for_monitor;
# Send the parent a "worker_bored" message, then read from the parent.
# NOTE(review): the meaning of the `50` in the message and of the `1`
# argument is not shown here -- confirm against MogileFS::Worker.
38 $self->send_to_parent("worker_bored 50 delete");
39 $self->read_from_parent(1);
42 # call our workers, and have them do things
43 # RETVAL = 0; I think I am done working for now
44 # RETVAL = 1; I have more work to do
45 my $tempres = $self->process_tempfiles;
# The old-style queue is only serviced once the clock has passed
# $old_queue_check; delayed deletes are re-enqueued first so they are picked
# up by the same pass.
47 if (time() > $old_queue_check) {
48 $self->reenqueue_delayed_deletes;
49 $delres = $self->process_deletes;
50 # if we did no work, crawl the backoff.
# NOTE(review): the branch header that decides between "reset" and "crawl"
# is not visible; presumably the reset below runs only when $delres is true.
52 $old_queue_backoff = 0;
# The next check is only pushed into the future once the backoff index has
# grown past 360; the index itself stops growing once it exceeds 1800.
55 $old_queue_check = time() + $old_queue_backoff
56 if $old_queue_backoff > 360;
57 $old_queue_backoff++ unless $old_queue_backoff > 1800;
61 my $delres2 = $self->process_deletes2;
63 # unless someone did some work, let's sleep
64 unless ($tempres || $delres || $delres2) {
# Idle counter climbs by one per idle pass, capped at $sleep_max.
65 $sleep_for++ if $sleep_for < $sleep_max;
# Expire stale rows from the tempfile table.
#
# For every old tempfile row this collects one DevFID per device id listed in
# the row's devids column plus the fid id itself, then records the DevFIDs via
# mass_insert_file_on, enqueues the fid ids via enqueue_fids_to_delete2, and
# deletes the tempfile rows.
#
# Returns 0 when there are no old tempfiles, or when none were collected.
# NOTE(review): several interior lines (the `$self` declaration, the guards
# around delete_tempfile_row and the devids sanity check, the closing braces
# and the final return value) are not among the visible lines.
73 sub process_tempfiles
{
75 # also clean the tempfile table
76 #mysql> select * from tempfile where createtime < unix_timestamp() - 86400 limit 50;
77 #+--------+------------+---------+------+---------+--------+
78 #| fid | createtime | classid | dmid | dkey | devids |
79 #+--------+------------+---------+------+---------+--------+
80 #| 3253 | 1149451058 | 1 | 1 | file574 | 1,2 |
81 #| 4559 | 1149451156 | 1 | 1 | file83 | 1,2 |
82 #| 11024 | 1149451697 | 1 | 1 | file836 | 2,1 |
83 #| 19885 | 1149454542 | 1 | 1 | file531 | 1,2 |
86 # the fids might exist on one of the devices in devids column if we assigned them those,
87 # they wrote some to one of them, then they died or for whatever reason didn't create_close
88 # to use, so we shouldn't delete from tempfile before going on a hunt of the missing fid.
89 # perhaps we should just add to the file_on table for both devids, and let the regular delete
90 # process discover via 404 that they're not there.
92 # select fid, devids from tempfile where createtime < unix_timestamp() - 86400
93 # add file_on rows for both of those,
94 # add fid to fids_to_delete table,
95 # delete from tempfile where fid=?
98 # dig up some temporary files to purge
99 my $sto = Mgd
::get_store
();
# Age threshold passed to old_tempfiles: the T_TEMPFILE_TOO_OLD environment
# variable if set to a true value, otherwise 3600 (presumably seconds --
# confirm in the store implementation).
100 my $too_old = int($ENV{T_TEMPFILE_TOO_OLD
} || 3600);
101 my $tempfiles = $sto->old_tempfiles($too_old);
102 return 0 unless $tempfiles && @
$tempfiles;
104 # insert the right rows into file_on and file_to_delete and remove the
105 # now expunged (or soon to be) rows from tempfile
106 my (@devfids, @fidids);
# Each row is an array ref: element 0 is the fid id, element 1 the devids
# string.
107 foreach my $row (@
$tempfiles) {
109 # If FID is still loadable, we've arrived here due to a bug or race
110 # condition elsewhere. Remove the tempfile row but don't delete the
112 my $fidid = $row->[0];
113 my $fid = MogileFS
::FID
->new($fidid);
# NOTE(review): the condition guarding this call is not visible; per the
# comment above it should apply only when the FID is still loadable.
115 $sto->delete_tempfile_row($fidid);
118 push @fidids, $fidid;
120 # sanity check the string column.
121 my $devids = $row->[1];
# Accept only a comma-separated list of decimal ids (e.g. "1,2").
# NOTE(review): what happens to a malformed value is not visible.
122 unless ($devids =~ /^(\d+)(,\d+)*$/) {
126 foreach my $devid (split /,/, $devids) {
127 push @devfids, MogileFS
::DevFID
->new($devid, $row->[0]);
131 # We might've done no work due to discovering the tempfiles are real.
132 return 0 unless @fidids;
134 $sto->mass_insert_file_on(@devfids);
135 $sto->enqueue_fids_to_delete2(@fidids);
# NOTE(review): the fid ids are interpolated straight into the SQL text.
# They originate from tempfile rows rather than user input, but bound
# placeholders would be the safer form.
136 $sto->dbh->do("DELETE FROM tempfile WHERE fid IN (" . join(',', @fidids) . ")");
140 # new style delete queueing. I'm not putting a lot of effort into commonizing
141 # code between the old one and the new one. Feel free to send a patch!
# Work through the new-style delete queue.
#
# For each item returned by queue_todo('delete') this loads the fid, walks the
# devices it is recorded on, and for each device either drops the fid/device
# association (device permanently dead, or HTTP DELETE answered 2xx/404) or
# reschedules the fid with a delay that grows with the item's failcount.
# When no devices remain for the fid, the queue entry is removed.
#
# NOTE(review): the `$self` declaration, the `next`/`return` statements that
# end each branch, the `$httpcode` assignment and the closing braces are not
# among the visible lines, so the exact control flow and return value cannot
# be confirmed from here.
142 sub process_deletes2
{
145 my $sto = Mgd
::get_store
();
147 my $queue_todo = $self->queue_todo('delete');
148 unless (@
$queue_todo) {
153 while (my $todo = shift @
$queue_todo) {
# Read from the parent once per queue item.
155 $self->read_from_parent;
157 # load all the devids related to this fid, and delete.
158 my $fid = MogileFS
::FID
->new($todo->{fid
});
159 my $fidid = $fid->id;
160 my @devids = $fid->devids;
# Set of devices still to be dealt with; entries are removed as each
# device is handled, and an empty set means the fid is fully deleted.
161 my %devids = map { $_ => 1 } @devids;
164 for my $devid (@devids) {
# A devid of 0/undef yields no device object at all.
165 my $dev = $devid ? MogileFS
::Device
->of_devid($devid) : undef;
166 error
("deleting fid $fidid, on devid ".($devid || 'NULL')."...") if $Mgd::DEBUG
>= 2;
# NOTE(review): the body of this branch is not visible. If it does not
# leave the iteration, the ->dstate call below would run on an undefined
# $dev -- confirm.
167 unless ($dev && $dev->exists) {
# Permanently dead device: drop the association without contacting it.
170 if ($dev->dstate->is_perm_dead) {
171 $sto->remove_fidid_from_devid($fidid, $devid);
172 delete $devids{$devid};
175 # devid is observed down/readonly: delay for at least
# Delay argument is 60 * (10 + failcount): ten minutes plus one per
# recorded failure if the unit is seconds (presumably -- confirm).
177 unless ($dev->observed_writeable) {
178 $sto->reschedule_file_to_delete2_relative($fidid,
179 60 * (10 + $todo->{failcount
}));
182 # devid is marked readonly/down/etc: delay for
# Delay argument is 60 * 60 * (1 + failcount).
184 unless ($dev->can_delete_from) {
185 $sto->reschedule_file_to_delete2_relative($fidid,
186 60 * 60 * (1 + $todo->{failcount
}));
190 my $dfid = MogileFS
::DevFID
->new($dev, $fidid);
191 my $path = $dfid->url;
193 # dormando: "There are cases where url can return undefined,
194 # Mogile appears to try to replicate to bogus devices
197 error
("in deleter, url(devid=$devid, fid=$fidid) returned nothing");
# url_parts result is used as [0] = peer address, [1] = peer port,
# [2] = request path.
201 my $urlparts = MogileFS
::Util
::url_parts
($path);
203 # hit up the server and delete it
204 # TODO: (optimization) use MogileFS->get_observed_state and don't
205 # try to delete things known to be down/etc
206 my $sock = IO
::Socket
::INET
->new(PeerAddr
=> $urlparts->[0],
207 PeerPort
=> $urlparts->[1],
209 # this used to mark the device as down for the whole tracker.
210 # if the device is actually down, we can struggle until the
211 # monitor job figures it out... otherwise an occasional timeout
212 # due to high load will prevent delete from working at all.
214 $sto->reschedule_file_to_delete2_relative($fidid,
215 60 * 60 * (1 + $todo->{failcount
}));
219 # send delete request
220 error
("Sending delete for $path") if $Mgd::DEBUG
>= 2;
# Minimal HTTP/1.0 request with no headers; only the first line of the
# reply (the status line) is read.
222 $sock->write("DELETE $urlparts->[2] HTTP/1.0\r\n\r\n");
223 my $response = <$sock>;
224 if ($response =~ m!^HTTP/\d+\.\d+\s+(\d+)!) {
# Any 2xx counts as success, and so does 404 (nothing there to delete).
225 if (($1 >= 200 && $1 <= 299) || $1 == 404) {
226 # effectively means all went well
227 $sto->remove_fidid_from_devid($fidid, $devid);
228 delete $devids{$devid};
230 # remote file system error? mark node as down
# NOTE(review): $httpcode is not assigned in any visible line --
# confirm it is captured from $1 before this message is built.
232 error
("Error: unlink failure: $path: HTTP code $httpcode");
# Delay argument is 60 * 30 * (1 + failcount).
234 $sto->reschedule_file_to_delete2_relative($fidid,
235 60 * 30 * (1 + $todo->{failcount
}));
239 error
("Error: unknown response line deleting $path: $response");
# Every device has been handled: remove the fid from the queue.
244 unless (keys %devids) {
245 $sto->delete_fid_from_file_to_delete2($fidid);
# Work through the old-style delete queue.
#
# Selects (fid, devid) pairs from file_to_delete LEFT JOINed to file_on,
# shuffles them, and handles rows until the per-call counter exceeds
# PER_BATCH. Each row ends in one of three local actions: $done_with_fid (drop
# the queue row), $done_with_devid (drop the file_on row) or $reschedule_fid
# (park the fid in file_to_delete_later and drop the queue row).
#
# Returns 0 when the query yields no rows.
# NOTE(review): the `$self`, `$dbh` and `$done` declarations, the tail of the
# SQL statement, the `$httpcode` assignment, the `next` statements ending each
# case and the final return are not among the visible lines.
254 sub process_deletes
{
257 my $sto = Mgd
::get_store
();
# LEFT JOIN means a queued fid with no file_on rows still comes back once,
# with an undefined devid.
260 my $delmap = $dbh->selectall_arrayref("SELECT fd.fid, fo.devid ".
261 "FROM file_to_delete fd ".
262 "LEFT JOIN file_on fo ON fd.fid=fo.fid ".
264 my $count = $delmap ?
scalar @
$delmap : 0;
265 return 0 unless $count;
# Rows are visited in random order, and the loop stops once the counter
# passes PER_BATCH.
268 foreach my $dm (List
::Util
::shuffle
(@
$delmap)) {
269 last if ++$done > PER_BATCH
;
272 $self->read_from_parent;
273 my ($fid, $devid) = @
$dm;
274 error
("deleting fid $fid, on devid ".($devid || 'NULL')."...") if $Mgd::DEBUG
>= 2;
# Removes this fid's row from file_to_delete. The string argument that
# callers pass is not used in the visible lines of the closure.
276 my $done_with_fid = sub {
278 $dbh->do("DELETE FROM file_to_delete WHERE fid=?", undef, $fid);
279 $sto->condthrow("Failure to delete from file_to_delete for fid=$fid");
# Removes the file_on row for this fid/devid pair; a database error is
# checked via condthrow and then again via $dbh->err.
282 my $done_with_devid = sub {
284 $dbh->do("DELETE FROM file_on WHERE fid=? AND devid=?",
285 undef, $fid, $devid);
286 $sto->condthrow("Failure to delete from file_on for $fid/$devid");
287 die "Failed to delete from file_on: " . $dbh->errstr if $dbh->err;
# Arguments: a delay (coerced with int) and a reason string used only in
# the debug message. Inserts the fid into file_to_delete_later with
# delafter = store timestamp + delay, then drops it from the live queue.
290 my $reschedule_fid = sub {
291 my ($secs, $reason) = (int(shift), shift);
292 $sto->insert_ignore("INTO file_to_delete_later (fid, delafter) ".
293 "VALUES (?,".$sto->unix_timestamp."+$secs)", undef,
295 error
("delete of fid $fid rescheduled: $reason") if $Mgd::DEBUG
>= 2;
296 $done_with_fid->("rescheduled");
300 # devid is null: doesn't exist anywhere anymore, we're done with this fid.
301 # devid is observed down/readonly: delay for 10 minutes
302 # devid is marked readonly: delay for 2 hours
303 # devid is marked dead or doesn't exist: consider it deleted on this devid.
305 # CASE: devid is null, which means we're done deleting all instances.
306 unless (defined $devid) {
307 $done_with_fid->("no_more_locations");
311 # CASE: devid is marked dead or doesn't exist: consider it deleted on this devid.
312 # (Note: we're tolerant of '0' as a devid, due to old buggy version which
313 # would sometimes put that in there)
314 my $dev = $devid ? MogileFS
::Device
->of_devid($devid) : undef;
315 unless ($dev && $dev->exists) {
316 $done_with_devid->("devid_doesnt_exist");
319 if ($dev->dstate->is_perm_dead) {
320 $done_with_devid->("devid_marked_dead");
324 # CASE: devid is observed down/readonly: delay for 10 minutes
325 unless ($dev->observed_writeable) {
326 $reschedule_fid->(60 * 10, "not_observed_writeable");
330 # CASE: devid is marked readonly/down/etc: delay for 2 hours
331 unless ($dev->can_delete_from) {
332 $reschedule_fid->(60 * 60 * 2, "devid_marked_not_alive");
336 my $dfid = MogileFS
::DevFID
->new($dev, $fid);
337 my $path = $dfid->url;
339 # dormando: "There are cases where url can return undefined,
340 # Mogile appears to try to replicate to bogus devices
343 error
("in deleter, url(devid=$devid, fid=$fid) returned nothing");
# url_parts result is used as [0] = peer address, [1] = peer port,
# [2] = request path.
347 my $urlparts = MogileFS
::Util
::url_parts
($path);
349 # hit up the server and delete it
350 # TODO: (optimization) use MogileFS->get_observed_state and don't try to delete things known to be down/etc
351 my $sock = IO
::Socket
::INET
->new(PeerAddr
=> $urlparts->[0],
352 PeerPort
=> $urlparts->[1],
355 # timeout or something, mark this device as down for now and move on
# Unlike process_deletes2, this path broadcasts the host as unreachable
# before rescheduling.
356 $self->broadcast_host_unreachable($dev->hostid);
357 $reschedule_fid->(60 * 60 * 2, "no_sock_to_hostid");
361 # send delete request
362 error
("Sending delete for $path") if $Mgd::DEBUG
>= 2;
# Minimal HTTP/1.0 request with no headers; only the first line of the
# reply (the status line) is read.
364 $sock->write("DELETE $urlparts->[2] HTTP/1.0\r\n\r\n");
365 my $response = <$sock>;
366 if ($response =~ m!^HTTP/\d+\.\d+\s+(\d+)!) {
# Any 2xx counts as success, and so does 404 (nothing there to delete).
367 if (($1 >= 200 && $1 <= 299) || $1 == 404) {
368 # effectively means all went well
369 $done_with_devid->("deleted");
371 # remote file system error? mark node as down
# NOTE(review): $httpcode is not assigned in any visible line --
# confirm it is captured from $1 before these two uses.
373 error
("Error: unlink failure: $path: HTTP code $httpcode");
374 $reschedule_fid->(60 * 30, "http_code_$httpcode");
378 error
("Error: unknown response line deleting $path: $response");
382 # as far as we know, we have more work to do
# Move fids out of file_to_delete_later and back onto the delete queue.
#
# Takes the fid ids reported by the store's fids_to_delete_again, re-enqueues
# them with enqueue_fids_to_delete, then removes their rows from
# file_to_delete_later.
# NOTE(review): the `$self` and `$dbh` declarations and the end of the
# fids_to_delete_again statement are not among the visible lines. Presumably
# an empty result returns early there; otherwise the IN () list below would
# be empty and the SQL invalid -- confirm.
386 sub reenqueue_delayed_deletes
{
389 my $sto = Mgd
::get_store
();
392 my @fidids = $sto->fids_to_delete_again
395 $sto->enqueue_fids_to_delete(@fidids);
# The ids are interpolated directly into the statement text rather than bound
# as placeholders.
397 $dbh->do("DELETE FROM file_to_delete_later WHERE fid IN (" .
398 join(",", @fidids) . ")");
399 $sto->condthrow("reenqueue file_to_delete_later delete");
407 # indent-tabs-mode: nil