2 * Copyright (c) 2005 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/usr.sbin/dntpd/client.c,v 1.13 2007/06/26 02:40:20 dillon Exp $
39 static int client_insane(struct server_info
**, int, server_info_t
);
47 client_main(struct server_info
**info_ary
, int count
)
49 struct server_info
*best_off
;
50 struct server_info
*best_freq
;
54 int calc_offset_correction
;
63 * Subtract the interval from poll_sleep and poll the client
66 * Because we do not compensate for offset corrections which are
67 * in progress, we cannot accumulate data for an offset correction
68 * while a prior correction is still being worked through by the
71 calc_offset_correction
= !sysntp_offset_correction_is_running();
72 for (i
= 0; i
< count
; ++i
)
73 client_poll(info_ary
[i
], min_sleep_opt
, calc_offset_correction
);
76 * Find the best client (or synthesize one). A different client
77 * can be chosen for frequency and offset. Note in particular
78 * that offset counters and averaging code gets reset when an
79 * offset correction is made (otherwise the averaging history will
80 * cause later corrections to overshoot).
82 * The regression used to calculate the frequency is a much
83 * longer-term entity and is NOT reset, so it is still possible
84 * for the offset correction code to make minor adjustments to
85 * the frequency if it so desires.
87 * client_check may replace the server_info pointer with a new
92 for (i
= 0; i
< count
; ++i
)
93 client_check(&info_ary
[i
], &best_off
, &best_freq
);
96 * Check for server insanity. In large NTP pools some servers
97 * may just be dead wrong, but report that they are right.
100 insane
= client_insane(info_ary
, count
, best_off
);
103 * best_off meets the quorum requirements and is good
106 best_off
->server_insane
= 0;
107 } else if (insane
== 0) {
109 * best_off is probably good, but we do not have enough
110 * servers reporting yet to meet the quorum requirements.
115 * best_off is ugly, mark the server as being insane for
118 best_off
->server_insane
= 60 * 60;
119 logdebuginfo(best_off
, 1,
120 "excessive offset deviation, mapping out\n");
129 offset
= best_off
->lin_sumoffset
/ best_off
->lin_countoffset
;
130 lin_resetalloffsets(info_ary
, count
);
131 if (offset
< -COURSE_OFFSET_CORRECTION_LIMIT
||
132 offset
> COURSE_OFFSET_CORRECTION_LIMIT
||
135 freq
= sysntp_correct_course_offset(offset
);
138 freq
= sysntp_correct_offset(offset
);
145 * Frequency correction (throw away minor freq adjusts from the
146 * offset code if we can't do a frequency correction here). Do
147 * not reissue if it hasn't changed from the last issued correction.
150 freq
+= best_freq
->lin_cache_freq
;
151 if (last_freq
!= freq
) {
152 sysntp_correct_freq(freq
);
158 * This function is responsible for managing the polling mode and
159 * figures out how long we should sleep.
162 for (i
= 0; i
< count
; ++i
)
163 client_manage_polling_mode(info_ary
[i
], &didreconnect
);
165 client_check_duplicate_ips(info_ary
, count
);
168 * Polling loop sleep.
170 usleep(min_sleep_opt
* 1000000 + random() % 500000);
175 client_poll(server_info_t info
, int poll_interval
, int calc_offset_correction
)
183 * Adjust the insane-server countdown
185 if (info
->server_insane
> poll_interval
)
186 info
->server_insane
-= poll_interval
;
188 info
->server_insane
= 0;
191 * By default we always poll. If the polling interval comes under
192 * active management the poll_sleep will be non-zero.
194 if (info
->poll_sleep
> poll_interval
) {
195 info
->poll_sleep
-= poll_interval
;
198 info
->poll_sleep
= 0;
201 * If the client isn't open don't mess with the poll_failed count
202 * or anything else. We are left in the init or startup phase.
205 if (info
->poll_failed
< 0x7FFFFFFF)
210 logdebuginfo(info
, 4, "poll, ");
211 if (udp_ntptimereq(info
->fd
, &rtv
, <v
, &lbtv
) < 0) {
213 logdebug(4, "no response (%d failures in a row)\n", info
->poll_failed
);
214 if (info
->poll_failed
== POLL_FAIL_RESET
) {
215 if (info
->lin_count
!= 0) {
216 logdebuginfo(info
, 4, "resetting regression due to failures\n");
224 * Successful query. Update polling info for the polling mode manager.
227 info
->poll_failed
= 0;
230 * Figure out the offset (the difference between the reported
231 * time and our current time) for linear regression purposes.
233 offset
= tv_delta_double(&rtv
, <v
);
239 if (debug_level
>= 4) {
246 strftime(buf
, sizeof(buf
), "%d-%b-%Y %H:%M:%S", tp
);
247 logdebug(4, "%s.%03ld ", buf
, rtv
.tv_usec
/ 1000);
249 lin_regress(info
, <v
, &lbtv
, offset
, calc_offset_correction
);
250 info
= info
->altinfo
;
251 if (info
&& debug_level
>= 4) {
252 logdebug(4, "%*.*s: poll, ",
253 (int)strlen(info
->target
),
254 (int)strlen(info
->target
), "(alt)");
260 * Find the best client (or synthesize a fake info structure to return).
261 * We can find separate best clients for offset and frequency.
264 client_check(struct server_info
**checkp
,
265 struct server_info
**best_off
,
266 struct server_info
**best_freq
)
268 struct server_info
*check
= *checkp
;
269 struct server_info
*info
;
272 * Start an alternate linear regression once our current one
273 * has passed a certain point.
275 if (check
->lin_count
>= LIN_RESTART
/ 2 && check
->altinfo
== NULL
) {
276 info
= malloc(sizeof(*info
));
277 assert(info
!= NULL
);
278 /* note: check->altinfo is NULL as of the bcopy */
279 bcopy(check
, info
, sizeof(*info
));
280 check
->altinfo
= info
;
285 * Replace our current linear regression with the alternate once
286 * the current one has hit its limit (beyond a certain point the
287 * linear regression starts to work against us, preventing us from
288 * reacting to changing conditions).
290 * Report any significant change in the offset or ppm.
292 if (check
->lin_count
>= LIN_RESTART
) {
293 if ((info
= check
->altinfo
) && info
->lin_count
>= LIN_RESTART
/ 2) {
296 freq_diff
= info
->lin_cache_freq
- check
->lin_cache_freq
;
297 logdebuginfo(info
, 4, "Switching to alternate, Frequence "
298 "difference is %6.3f ppm\n",
307 * BEST CLIENT FOR FREQUENCY CORRECTION:
309 * 8 samples and a correlation >= 0.99, or
310 * 16 samples and a correlation >= 0.96
313 if ((check
->lin_count
>= 8 && fabs(check
->lin_cache_corr
) >= 0.99) ||
314 (check
->lin_count
>= 16 && fabs(check
->lin_cache_corr
) >= 0.96)
317 fabs(check
->lin_cache_corr
) > fabs(info
->lin_cache_corr
)
326 * BEST CLIENT FOR OFFSET CORRECTION:
328 * Use the standard-deviation and require at least 4 samples. An
329 * offset correction is valid if the standard deviation is less than
330 * the average offset divided by 4.
332 * Servers marked as being insane are not allowed
335 if (check
->lin_countoffset
>= 4 &&
336 (check
->lin_cache_stddev
<
337 fabs(check
->lin_sumoffset
/ check
->lin_countoffset
/ 4)) &&
338 check
->server_insane
== 0
341 fabs(check
->lin_cache_stddev
) < fabs(info
->lin_cache_stddev
)
350 * Actively manage the polling interval. Note that the poll_* fields are
351 * always transferred to the alternate regression when the check code replaces
352 * the current regression with a new one.
354 * This routine is called from the main loop for each base info structure.
355 * The polling mode applies to all alternates so we do not have to iterate
359 client_manage_polling_mode(struct server_info
*info
, int *didreconnect
)
362 * Permanently failed servers are ignored.
364 if (info
->server_state
== -2)
368 * Our polling interval has not yet passed.
370 if (info
->poll_sleep
)
374 * Standard polling mode progression
376 switch(info
->poll_mode
) {
379 * Initial state after connect or when a reconnect is required.
382 logdebuginfo(info
, 2, "polling mode INIT, relookup & reconnect\n");
383 reconnect_server(info
);
386 if (info
->poll_failed
>= POLL_RECOVERY_RESTART
* 5)
387 info
->poll_sleep
= max_sleep_opt
;
388 else if (info
->poll_failed
>= POLL_RECOVERY_RESTART
)
389 info
->poll_sleep
= nom_sleep_opt
;
391 info
->poll_sleep
= min_sleep_opt
;
396 * Transition the server to the DNS lookup successful state.
397 * Note that the server state does not transition out of
398 * lookup successful if we relookup after a packet failure
399 * so the message is printed only once, usually.
401 client_setserverstate(info
, 0, "DNS lookup success");
404 * If we've failed many times switch to the startup state but
405 * do not fall through into it. break the switch and a single
406 * poll will be made after the nominal polling interval.
408 if (info
->poll_failed
>= POLL_RECOVERY_RESTART
* 5) {
409 logdebuginfo(info
, 2, "polling mode INIT->STARTUP (very slow)\n");
410 info
->poll_mode
= POLL_STARTUP
;
411 info
->poll_sleep
= max_sleep_opt
;
412 info
->poll_count
= 0;
414 } else if (info
->poll_failed
>= POLL_RECOVERY_RESTART
) {
415 logdebuginfo(info
, 2, "polling mode INIT->STARTUP (slow)\n");
416 info
->poll_mode
= POLL_STARTUP
;
417 info
->poll_count
= 0;
423 * Fall through to the startup state.
425 info
->poll_mode
= POLL_STARTUP
;
426 logdebuginfo(info
, 2, "polling mode INIT->STARTUP (normal)\n");
430 * Transition to a FAILED state if too many poll failures occurred.
432 if (info
->poll_failed
>= POLL_FAIL_RESET
) {
433 logdebuginfo(info
, 2, "polling mode STARTUP->FAILED\n");
434 info
->poll_mode
= POLL_FAILED
;
435 info
->poll_count
= 0;
440 * Transition the server to operational. Do a number of minimum
441 * interval polls to try to get a good offset calculation quickly.
443 if (info
->poll_count
)
444 client_setserverstate(info
, 1, "connected ok");
445 if (info
->poll_count
< POLL_STARTUP_MAX
) {
446 info
->poll_sleep
= min_sleep_opt
;
451 * Once we've got our polls fall through to acquisition mode to
452 * do acquisition processing.
454 info
->poll_mode
= POLL_ACQUIRE
;
455 info
->poll_count
= 0;
456 logdebuginfo(info
, 2, "polling mode STARTUP->ACQUIRE\n");
460 * Transition to a FAILED state if too many poll failures occurred.
462 if (info
->poll_failed
>= POLL_FAIL_RESET
) {
463 logdebuginfo(info
, 2, "polling mode STARTUP->FAILED\n");
464 info
->poll_mode
= POLL_FAILED
;
465 info
->poll_count
= 0;
470 * Acquisition mode using the nominal timeout. We do not shift
471 * to maintenance mode unless the correlation is at least 0.85
473 if (info
->poll_count
< POLL_ACQUIRE_MAX
||
474 info
->lin_count
< 8 ||
475 fabs(info
->lin_cache_corr
) < 0.85
477 if (info
->poll_count
>= POLL_ACQUIRE_MAX
&&
478 info
->lin_count
== LIN_RESTART
- 2
480 logdebuginfo(info
, 2,
481 "WARNING: Unable to shift this source to "
482 "maintenance mode. Target correlation is aweful\n");
486 info
->poll_mode
= POLL_MAINTAIN
;
487 info
->poll_count
= 0;
488 logdebuginfo(info
, 2, "polling mode ACQUIRE->MAINTAIN\n");
492 * Transition to a FAILED state if too many poll failures occurred.
494 if (info
->poll_failed
>= POLL_FAIL_RESET
) {
495 logdebuginfo(info
, 2, "polling mode STARTUP->FAILED\n");
496 info
->poll_mode
= POLL_FAILED
;
497 info
->poll_count
= 0;
502 * Maintenance mode, max polling interval.
504 * Transition back to acquisition mode if we are unable to maintain
505 * this mode due to the correlation going bad.
507 if (info
->lin_count
>= LIN_RESTART
/ 2 &&
508 fabs(info
->lin_cache_corr
) < 0.70
510 logdebuginfo(info
, 2,
511 "polling mode MAINTAIN->ACQUIRE. Unable to maintain\n"
512 "the maintenance mode because the correlation went"
514 info
->poll_mode
= POLL_ACQUIRE
;
515 info
->poll_count
= 0;
518 info
->poll_sleep
= max_sleep_opt
;
522 * We have a communications failure. A late recovery is possible
523 * if we enter this state with a good poll.
525 if (info
->poll_count
!= 0) {
526 logdebuginfo(info
, 2, "polling mode FAILED->ACQUIRE\n");
527 if (info
->poll_failed
>= POLL_FAIL_RESET
)
528 info
->poll_mode
= POLL_STARTUP
;
530 info
->poll_mode
= POLL_ACQUIRE
;
531 /* do not reset poll_count */
536 * If we have been failed too long, disconnect from the server
537 * and start us all over again. Note that the failed count is not
540 if (info
->poll_failed
>= POLL_RECOVERY_RESTART
) {
541 logdebuginfo(info
, 2, "polling mode FAILED->INIT\n");
542 client_setserverstate(info
, 0, "FAILED");
543 disconnect_server(info
);
544 info
->poll_mode
= POLL_FIXED
;
551 * If the above state machine has not set a polling interval, set a
552 * nominal polling interval.
554 if (info
->poll_sleep
== 0)
555 info
->poll_sleep
= nom_sleep_opt
;
559 * Look for duplicate IP addresses. This is done very infrequently, so we do
560 * not use a particularly efficient algorithm.
562 * Only reconnect a client which has not done its initial poll.
565 client_check_duplicate_ips(struct server_info
**info_ary
, int count
)
573 for (i
= 0; i
< count
; ++i
) {
575 if (info1
->fd
< 0 || info1
->server_state
!= 0)
577 for (tries
= 0; tries
< 10; ++tries
) {
578 for (j
= 0; j
< count
; ++j
) {
580 if (i
== j
|| info2
->fd
< 0)
582 if (strcmp(info1
->ipstr
, info2
->ipstr
) == 0) {
583 reconnect_server(info1
);
591 disconnect_server(info1
);
592 client_setserverstate(info1
, -2,
593 "permanently disabling duplicate server");
599 * Calculate whether the server pointed to by best is insane or not.
600 * For some reason some servers in e.g. the ntp pool are sometimes an hour
601 * off. If we have at least three servers in the pool require that a
602 * quorum agree that the current best server's offset is reasonable.
604 * Allow +/- 0.5 seconds of error for now (settable with option).
606 * Returns -1 if insane, 0 if not enough samples, and 1 if ok
610 client_insane(struct server_info
**info_ary
, int count
, server_info_t best
)
622 * If only one ntp server we cannot check to see if it is insane
626 best_offset
= best
->lin_sumoffset
/ best
->lin_countoffset
;
629 * Calculate the quorum. Do not count permanently failed servers
630 * in the calculation.
632 * adjusted count quorum
639 for (i
= 0; i
< count
; ++i
) {
641 if (info
->server_state
== -2)
645 quorum
= quorum
/ 2 + 1;
651 * Find the good, the bad, and the ugly. We need at least four samples
652 * and a stddev within the deviation being checked to count a server
653 * in the calculation.
655 for (i
= 0; i
< count
; ++i
) {
657 if (info
->lin_countoffset
< 4 ||
658 info
->lin_cache_stddev
> insane_deviation
664 info_offset
= info
->lin_sumoffset
/ info
->lin_countoffset
;
665 info_offset
-= best_offset
;
666 if (info_offset
< -insane_deviation
|| info_offset
> insane_deviation
)
673 * Did we meet our quorum?
675 logdebuginfo(best
, 5, "insanecheck good=%d bad=%d skip=%d "
676 "quorum=%d (allowed=%-+8.6f)\n",
677 good
, bad
, skip
, quorum
, insane_deviation
);
680 if (good
+ skip
>= quorum
)
688 * ltv local time as of when the offset error was calculated between
689 * local time and remote time.
691 * lbtv base time as of when local time was obtained. Used to
692 * calculate the cumulative corrections made to the system's
693 * real time clock so we can de-correct the offset for the
696 * X is the time axis, in seconds.
697 * Y is the uncorrected offset, in seconds.
700 lin_regress(server_info_t info
, struct timeval
*ltv
, struct timeval
*lbtv
,
701 double offset
, int calc_offset_correction
)
704 double uncorrected_offset
;
707 * De-correcting the offset:
709 * The passed offset is (our_real_time - remote_real_time). To remove
710 * corrections from our_real_time we take the difference in the basetime
711 * (new_base_time - old_base_time) and subtract that from the offset.
712 * That is, if the basetime goes up, the uncorrected offset goes down.
714 if (info
->lin_count
== 0) {
716 info
->lin_btv
= *lbtv
;
718 uncorrected_offset
= offset
;
720 time_axis
= tv_delta_double(&info
->lin_tv
, ltv
);
721 uncorrected_offset
= offset
- tv_delta_double(&info
->lin_btv
, lbtv
);
725 * We have to use the uncorrected offset for frequency calculations.
728 info
->lin_sumx
+= time_axis
;
729 info
->lin_sumx2
+= time_axis
* time_axis
;
730 info
->lin_sumy
+= uncorrected_offset
;
731 info
->lin_sumy2
+= uncorrected_offset
* uncorrected_offset
;
732 info
->lin_sumxy
+= time_axis
* uncorrected_offset
;
735 * We have to use the corrected offset for offset calculations.
737 if (calc_offset_correction
) {
738 ++info
->lin_countoffset
;
739 info
->lin_sumoffset
+= offset
;
740 info
->lin_sumoffset2
+= offset
* offset
;
744 * Calculate various derived values. This gets us slope, y-intercept,
745 * and correlation from the linear regression.
747 if (info
->lin_count
> 1) {
748 info
->lin_cache_slope
=
749 (info
->lin_count
* info
->lin_sumxy
- info
->lin_sumx
* info
->lin_sumy
) /
750 (info
->lin_count
* info
->lin_sumx2
- info
->lin_sumx
* info
->lin_sumx
);
752 info
->lin_cache_yint
=
753 (info
->lin_sumy
- info
->lin_cache_slope
* info
->lin_sumx
) /
756 info
->lin_cache_corr
=
757 (info
->lin_count
* info
->lin_sumxy
- info
->lin_sumx
* info
->lin_sumy
) /
758 sqrt((info
->lin_count
* info
->lin_sumx2
-
759 info
->lin_sumx
* info
->lin_sumx
) *
760 (info
->lin_count
* info
->lin_sumy2
-
761 info
->lin_sumy
* info
->lin_sumy
)
766 * Calculate more derived values. This gets us the standard-deviation
767 * of offsets. The standard deviation approximately means that 68%
768 * of the samples fall within the calculated stddev of the mean.
770 if (info
->lin_countoffset
> 1) {
771 info
->lin_cache_stddev
=
772 sqrt((info
->lin_sumoffset2
-
773 ((info
->lin_sumoffset
* info
->lin_sumoffset
/
774 info
->lin_countoffset
))) /
775 (info
->lin_countoffset
- 1.0));
779 * Save the most recent offset, we might use it in the future.
780 * Save the frequency correction (we might scale the slope later so
781 * we have a separate field for the actual frequency correction in
782 * seconds per second).
784 info
->lin_cache_offset
= offset
;
785 info
->lin_cache_freq
= info
->lin_cache_slope
;
787 if (debug_level
>= 4) {
788 logdebuginfo(info
, 4, "iter=%2d time=%7.3f off=%+.6f uoff=%+.6f",
789 (int)info
->lin_count
,
790 time_axis
, offset
, uncorrected_offset
);
791 if (info
->lin_count
> 1) {
792 logdebug(4, " slope %+7.6f"
793 " yint %+3.2f corr %+7.6f freq_ppm %+4.2f",
794 info
->lin_cache_slope
,
795 info
->lin_cache_yint
,
796 info
->lin_cache_corr
,
797 info
->lin_cache_freq
* 1000000.0);
799 if (info
->lin_countoffset
> 1) {
800 logdebug(4, " stddev %7.6f", info
->lin_cache_stddev
);
801 } else if (calc_offset_correction
== 0) {
802 /* cannot calculate offset correction due to prior correction */
803 logdebug(4, " offset_ignored");
810 * Reset the linear regression data. The info structure will not again be
811 * a candidate for frequency or offset correction until sufficient data
812 * has been accumulated to make a decision.
815 lin_reset(server_info_t info
)
826 info
->lin_countoffset
= 0;
827 info
->lin_sumoffset
= 0;
828 info
->lin_sumoffset2
= 0;
830 info
->lin_cache_slope
= 0;
831 info
->lin_cache_yint
= 0;
832 info
->lin_cache_corr
= 0;
833 info
->lin_cache_offset
= 0;
834 info
->lin_cache_freq
= 0;
837 * Destroy any additional alternative regressions.
839 while ((scan
= info
->altinfo
) != NULL
) {
840 info
->altinfo
= scan
->altinfo
;
846 * Sometimes we want to clean out the offset calculations without
847 * destroying the linear regression used to figure out the frequency
848 * correction. This usually occurs whenever we issue an offset
849 * adjustment to the system, which invalidates any offset data accumulated
853 lin_resetalloffsets(struct server_info
**info_ary
, int count
)
858 for (i
= 0; i
< count
; ++i
) {
859 for (info
= info_ary
[i
]; info
; info
= info
->altinfo
)
860 lin_resetoffsets(info
);
865 lin_resetoffsets(server_info_t info
)
867 info
->lin_countoffset
= 0;
868 info
->lin_sumoffset
= 0;
869 info
->lin_sumoffset2
= 0;
873 client_setserverstate(server_info_t info
, int state
, const char *str
)
875 if (info
->server_state
!= state
) {
876 info
->server_state
= state
;
877 logdebuginfo(info
, 1, "%s\n", str
);