2 * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
3 * (a.k.a. Fault Tolerance or Continuous Replication)
5 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
6 * Copyright (c) 2016 FUJITSU LIMITED
7 * Copyright (c) 2016 Intel Corporation
9 * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
11 * This work is licensed under the terms of the GNU GPL, version 2 or
12 * later. See the COPYING file in the top-level directory.
15 #include "qemu/osdep.h"
16 #include "qemu/error-report.h"
18 #include "qemu-common.h"
19 #include "qapi/qmp/qerror.h"
20 #include "qapi/error.h"
23 #include "qom/object_interfaces.h"
25 #include "qom/object.h"
26 #include "qemu/typedefs.h"
27 #include "net/queue.h"
28 #include "sysemu/char.h"
29 #include "qemu/sockets.h"
30 #include "qapi-visit.h"
/* Type name stored in colo_compare_info.name when the type is registered. */
33 #define TYPE_COLO_COMPARE "colo-compare"
/* Checked cast: runs OBJECT_CHECK on obj against TYPE_COLO_COMPARE for CompareState. */
34 #define COLO_COMPARE(obj) \
35 OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE)
/* Value returned by compare_chr_can_read(). */
37 #define COMPARE_READ_LEN_MAX NET_BUFSIZE
/* NOTE(review): presumably the per-connection queue length limit; its use site is not visible in this listing. */
38 #define MAX_QUEUE_SIZE 1024
40 /* TODO: Should be configurable */
/* Interval passed to g_timeout_source_new() and age threshold used by the old-packet check (milliseconds). */
41 #define REGULAR_PACKET_CHECK_MS 3000
46 +---------------+ +---------------+ +---------------+
47 |conn list +--->conn +--------->conn |
48 +---------------+ +---------------+ +---------------+
50 +---------------+ +---v----+ +---v----+ +---v----+ +---v----+
51 |primary | |secondary |primary | |secondary
52 |packet | |packet + |packet | |packet +
53 +--------+ +--------+ +--------+ +--------+
55 +---v----+ +---v----+ +---v----+ +---v----+
56 |primary | |secondary |primary | |secondary
57 |packet | |packet + |packet | |packet +
58 +--------+ +--------+ +--------+ +--------+
60 +---v----+ +---v----+ +---v----+ +---v----+
61 |primary | |secondary |primary | |secondary
62 |packet | |packet + |packet | |packet +
63 +--------+ +--------+ +--------+ +--------+
/*
 * Per-object state of one colo-compare instance.
 *
 * chr_pri_in / chr_sec_in: character backends whose read handlers
 *     (compare_pri_chr_in / compare_sec_chr_in) are installed by
 *     colo_compare_thread().
 * pri_rs / sec_rs: socket read states fed through net_fill_rstate();
 *     their finalize callbacks are registered with net_socket_rs_init().
 * connection_track_table: hash table handed to connection_get().
 * worker_context / compare_loop: GLib context and main loop created and
 *     run by colo_compare_thread(); the loop is quit in
 *     colo_compare_finalize().
 *
 * NOTE(review): other members used elsewhere in this file (pri_indev,
 * sec_indev, outdev, chr_out, conn_list, thread) and the closing line of
 * the typedef are not visible in this listing.
 */
65 typedef struct CompareState
{
71 CharBackend chr_pri_in
;
72 CharBackend chr_sec_in
;
74 SocketReadState pri_rs
;
75 SocketReadState sec_rs
;
77 /* connection list: the connections belonged to this NIC could be found
79 * element type: Connection
82 /* hashtable to save connection */
83 GHashTable
*connection_track_table
;
84 /* compare thread, a thread for each NIC */
87 GMainContext
*worker_context
;
88 GMainLoop
*compare_loop
;
/*
 * Class structure for the colo-compare type; it only embeds the parent
 * ObjectClass. Its size is used for colo_compare_info.class_size.
 * NOTE(review): the closing line of the typedef is not visible in this listing.
 */
91 typedef struct CompareClass
{
92 ObjectClass parent_class
;
100 static int compare_chr_send(CharBackend
*out
,
104 static gint
seq_sorter(Packet
*a
, Packet
*b
, gpointer data
)
106 struct tcphdr
*atcp
, *btcp
;
108 atcp
= (struct tcphdr
*)(a
->transport_header
);
109 btcp
= (struct tcphdr
*)(b
->transport_header
);
110 return ntohl(atcp
->th_seq
) - ntohl(btcp
->th_seq
);
/*
 * Return 0 on success. A return value of -1 means the packet is
 * unsupported (ARP or IPv6) and will be sent later.
 */
/*
 * Turn the buffer most recently completed on one input into a Packet and
 * append it to the matching per-connection queue.
 *
 * @s:    compare state owning the read states and the connection table
 * @mode: PRIMARY_IN selects s->pri_rs and the primary_list of the
 *        connection; the other visible branch uses s->sec_rs and
 *        secondary_list.
 *
 * Visible steps: packet_new() from the read-state buffer,
 * parse_packet_early() (the packet is destroyed when it reports failure),
 * fill_connection_key(), connection_get() on the tracking table, first-time
 * registration of the connection on s->conn_list, then a length-guarded
 * g_queue_push_tail(). TCP queues are re-sorted with seq_sorter after each
 * insertion; an error is reported when the queue is considered too big.
 *
 * NOTE(review): local declarations, the else branches, the right-hand
 * operand of the queue length comparison, the remaining call arguments and
 * all return statements are missing from this listing.
 */
117 static int packet_enqueue(CompareState
*s
, int mode
)
/* Build the packet from whichever input just completed a frame. */
123 if (mode
== PRIMARY_IN
) {
124 pkt
= packet_new(s
->pri_rs
.buf
, s
->pri_rs
.packet_len
);
126 pkt
= packet_new(s
->sec_rs
.buf
, s
->sec_rs
.packet_len
);
/* A non-zero result from the early parse leads to the packet being destroyed. */
129 if (parse_packet_early(pkt
)) {
130 packet_destroy(pkt
, NULL
);
134 fill_connection_key(pkt
, &key
);
136 conn
= connection_get(s
->connection_track_table
,
/* Put the connection on the compare list only once. */
140 if (!conn
->processing
) {
141 g_queue_push_tail(&s
->conn_list
, conn
);
142 conn
->processing
= true;
145 if (mode
== PRIMARY_IN
) {
146 if (g_queue_get_length(&conn
->primary_list
) <=
148 g_queue_push_tail(&conn
->primary_list
, pkt
);
/* TCP queues are kept ordered by sequence number. */
149 if (conn
->ip_proto
== IPPROTO_TCP
) {
150 g_queue_sort(&conn
->primary_list
,
151 (GCompareDataFunc
)seq_sorter
,
155 error_report("colo compare primary queue size too big,"
159 if (g_queue_get_length(&conn
->secondary_list
) <=
161 g_queue_push_tail(&conn
->secondary_list
, pkt
);
162 if (conn
->ip_proto
== IPPROTO_TCP
) {
163 g_queue_sort(&conn
->secondary_list
,
164 (GCompareDataFunc
)seq_sorter
,
168 error_report("colo compare secondary queue size too big,"
/*
 * The IP packets sent by the primary and the secondary
 * are compared here.
 * TODO: support IP fragments and out-of-order packets.
 * return:   0           means the packets are the same
 *           > 0 || < 0  means the packets are different
 */
183 static int colo_packet_compare_common(Packet
*ppkt
, Packet
*spkt
, int offset
)
185 if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE
)) {
186 char pri_ip_src
[20], pri_ip_dst
[20], sec_ip_src
[20], sec_ip_dst
[20];
188 strcpy(pri_ip_src
, inet_ntoa(ppkt
->ip
->ip_src
));
189 strcpy(pri_ip_dst
, inet_ntoa(ppkt
->ip
->ip_dst
));
190 strcpy(sec_ip_src
, inet_ntoa(spkt
->ip
->ip_src
));
191 strcpy(sec_ip_dst
, inet_ntoa(spkt
->ip
->ip_dst
));
193 trace_colo_compare_ip_info(ppkt
->size
, pri_ip_src
,
194 pri_ip_dst
, spkt
->size
,
195 sec_ip_src
, sec_ip_dst
);
198 if (ppkt
->size
== spkt
->size
) {
199 return memcmp(ppkt
->data
+ offset
, spkt
->data
+ offset
,
200 spkt
->size
- offset
);
202 trace_colo_compare_main("Net packet size are not the same");
/*
 * Called from the compare thread on the primary
 * to compare TCP packets.
 * compare_tcp was copied from Dr. David Alan Gilbert's branch.
 */
/*
 * TCP comparison callback passed to g_queue_find_custom() by
 * colo_compare_connection().
 *
 * @spkt: element taken from the secondary queue being searched
 * @ppkt: primary packet handed in as the search key
 *
 * Visible behaviour: when the primary packet has the IP_DF flag set, the
 * IP id and IP checksum of the secondary packet are overwritten with the
 * primary values so that these fields do not cause a mismatch. If the TCP
 * checksums are equal, the packets are compared with
 * colo_packet_compare_common() starting at ETH_HLEN. On a non-zero result
 * with the miscompare trace event enabled, source and destination details
 * are traced and both packets are hex-dumped to stderr.
 *
 * Side effect: spkt is modified in place (ip_id and ip_sum).
 *
 * NOTE(review): the declaration of res, the branch taken when the TCP
 * checksums differ, the remaining trace arguments and the return statement
 * are missing from this listing.
 */
212 static int colo_packet_compare_tcp(Packet
*spkt
, Packet
*ppkt
)
214 struct tcphdr
*ptcp
, *stcp
;
217 trace_colo_compare_main("compare tcp");
219 ptcp
= (struct tcphdr
*)ppkt
->transport_header
;
220 stcp
= (struct tcphdr
*)spkt
->transport_header
;
223 * The 'identification' field in the IP header is *very* random
224 * it almost never matches. Fudge this by ignoring differences in
225 * unfragmented packets; they'll normally sort themselves out if different
226 * anyway, and it should recover at the TCP level.
227 * An alternative would be to get both the primary and secondary to rewrite
228 * somehow; but that would need some sync traffic to sync the state
230 if (ntohs(ppkt
->ip
->ip_off
) & IP_DF
) {
231 spkt
->ip
->ip_id
= ppkt
->ip
->ip_id
;
232 /* and the sum will be different if the IDs were different */
233 spkt
->ip
->ip_sum
= ppkt
->ip
->ip_sum
;
236 if (ptcp
->th_sum
== stcp
->th_sum
) {
237 res
= colo_packet_compare_common(ppkt
, spkt
, ETH_HLEN
);
242 if (res
!= 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE
)) {
243 trace_colo_compare_pkt_info_src(inet_ntoa(ppkt
->ip
->ip_src
),
249 trace_colo_compare_pkt_info_dst(inet_ntoa(ppkt
->ip
->ip_dst
),
255 qemu_hexdump((char *)ppkt
->data
, stderr
,
256 "colo-compare ppkt", ppkt
->size
);
257 qemu_hexdump((char *)spkt
->data
, stderr
,
258 "colo-compare spkt", spkt
->size
);
/*
 * Called from the compare thread on the primary
 * to compare UDP packets.
 */
/*
 * UDP comparison callback passed to g_queue_find_custom() by
 * colo_compare_connection().
 *
 * @spkt: element taken from the secondary queue being searched
 * @ppkt: primary packet handed in as the search key
 *
 * The IP header length is derived from the ip_hl field of the primary
 * packet (ip_hl * 4 bytes). The comparison is delegated to
 * colo_packet_compare_common(), skipping ETH_HLEN plus that header length,
 * so only the bytes after the IP header are compared. The visible
 * miscompare reporting traces both packet sizes and, when the miscompare
 * trace event is enabled, hex-dumps both packets to stderr.
 *
 * NOTE(review): the declaration of ret, the condition guarding the
 * miscompare reporting, the last hexdump arguments and the return
 * statement are missing from this listing.
 */
268 static int colo_packet_compare_udp(Packet
*spkt
, Packet
*ppkt
)
271 int network_header_length
= ppkt
->ip
->ip_hl
* 4;
273 trace_colo_compare_main("compare udp");
276 * Because of ppkt and spkt are both in the same connection,
277 * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are
278 * same with spkt. In addition, IP header's Identification is a random
279 * field, we can handle it in IP fragmentation function later.
280 * COLO just concern the response net packet payload from primary guest
281 * and secondary guest are same or not, So we ignored all IP header include
282 * other field like TOS,TTL,IP Checksum. we only need to compare
283 * the ip payload here.
285 ret
= colo_packet_compare_common(ppkt
, spkt
,
286 network_header_length
+ ETH_HLEN
);
289 trace_colo_compare_udp_miscompare("primary pkt size", ppkt
->size
);
290 trace_colo_compare_udp_miscompare("Secondary pkt size", spkt
->size
);
291 if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE
)) {
292 qemu_hexdump((char *)ppkt
->data
, stderr
, "colo-compare pri pkt",
294 qemu_hexdump((char *)spkt
->data
, stderr
, "colo-compare sec pkt",
303 * Called from the compare thread on the primary
304 * for compare icmp packet
/*
 * ICMP comparison callback passed to g_queue_find_custom() by
 * colo_compare_connection().
 *
 * @spkt: element taken from the secondary queue being searched
 * @ppkt: primary packet handed in as the search key
 *
 * Skips ETH_HLEN plus the IP header length (ip_hl * 4 of the primary
 * packet) and compares the rest with colo_packet_compare_common(). When
 * that call reports a difference, both packet sizes are traced and, if the
 * miscompare trace event is enabled, both packets are hex-dumped to stderr.
 *
 * NOTE(review): the remaining trace and hexdump arguments and all return
 * statements are missing from this listing.
 */
306 static int colo_packet_compare_icmp(Packet
*spkt
, Packet
*ppkt
)
308 int network_header_length
= ppkt
->ip
->ip_hl
* 4;
310 trace_colo_compare_main("compare icmp");
313 * Because of ppkt and spkt are both in the same connection,
314 * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are
315 * same with spkt. In addition, IP header's Identification is a random
316 * field, we can handle it in IP fragmentation function later.
317 * COLO just concern the response net packet payload from primary guest
318 * and secondary guest are same or not, So we ignored all IP header include
319 * other field like TOS,TTL,IP Checksum. we only need to compare
320 * the ip payload here.
322 if (colo_packet_compare_common(ppkt
, spkt
,
323 network_header_length
+ ETH_HLEN
)) {
324 trace_colo_compare_icmp_miscompare("primary pkt size",
326 trace_colo_compare_icmp_miscompare("Secondary pkt size",
328 if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE
)) {
329 qemu_hexdump((char *)ppkt
->data
, stderr
, "colo-compare pri pkt",
331 qemu_hexdump((char *)spkt
->data
, stderr
, "colo-compare sec pkt",
341 * Called from the compare thread on the primary
342 * for compare other packet
344 static int colo_packet_compare_other(Packet
*spkt
, Packet
*ppkt
)
346 trace_colo_compare_main("compare other");
347 if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE
)) {
348 char pri_ip_src
[20], pri_ip_dst
[20], sec_ip_src
[20], sec_ip_dst
[20];
350 strcpy(pri_ip_src
, inet_ntoa(ppkt
->ip
->ip_src
));
351 strcpy(pri_ip_dst
, inet_ntoa(ppkt
->ip
->ip_dst
));
352 strcpy(sec_ip_src
, inet_ntoa(spkt
->ip
->ip_src
));
353 strcpy(sec_ip_dst
, inet_ntoa(spkt
->ip
->ip_dst
));
355 trace_colo_compare_ip_info(ppkt
->size
, pri_ip_src
,
356 pri_ip_dst
, spkt
->size
,
357 sec_ip_src
, sec_ip_dst
);
360 return colo_packet_compare_common(ppkt
, spkt
, 0);
/*
 * Age test for a single queued packet; used as the comparison callback of
 * g_queue_find_custom() in colo_old_packet_check_one_conn().
 *
 * @pkt:        packet to inspect
 * @check_time: pointer to the maximum allowed age in milliseconds
 *
 * Reads the current QEMU_CLOCK_HOST time in milliseconds and, when the
 * packet is older than *check_time, emits the old-packet trace event.
 *
 * NOTE(review): the return statements are missing from this listing.
 * g_queue_find_custom() treats a 0 result as a match, so presumably 0 is
 * returned for an old packet - confirm against the full source.
 */
363 static int colo_old_packet_check_one(Packet
*pkt
, int64_t *check_time
)
365 int64_t now
= qemu_clock_get_ms(QEMU_CLOCK_HOST
);
367 if ((now
- pkt
->creation_ms
) > (*check_time
)) {
368 trace_colo_old_packet_check_found(pkt
->creation_ms
);
/*
 * Per-connection step of the old-packet scan; invoked through
 * g_queue_foreach() from colo_old_packet_check().
 *
 * @opaque: the Connection to inspect
 *
 * Searches the primary_list of the connection with
 * colo_old_packet_check_one(), using REGULAR_PACKET_CHECK_MS as the age
 * threshold. The checkpoint notification for a hit is still a TODO.
 *
 * NOTE(review): the second parameter of the signature, one argument of
 * g_queue_find_custom() and the test on result are missing from this
 * listing.
 */
375 static void colo_old_packet_check_one_conn(void *opaque
,
378 Connection
*conn
= opaque
;
379 GList
*result
= NULL
;
380 int64_t check_time
= REGULAR_PACKET_CHECK_MS
;
382 result
= g_queue_find_custom(&conn
->primary_list
,
384 (GCompareFunc
)colo_old_packet_check_one
);
387 /* do checkpoint will flush old packet */
388 /* TODO: colo_notify_checkpoint();*/
393 * Look for old packets that the secondary hasn't matched,
394 * if we have some then we have to checkpoint to wake
397 static void colo_old_packet_check(void *opaque
)
399 CompareState
*s
= opaque
;
401 g_queue_foreach(&s
->conn_list
, colo_old_packet_check_one_conn
, NULL
);
405 * Called from the compare thread on the primary
406 * for compare connection
/*
 * Compare the queued packets of one connection; invoked through
 * g_queue_foreach() over s->conn_list.
 *
 * @opaque:    the Connection to process
 * @user_data: the owning CompareState
 *
 * While both queues are non-empty, the packet at the tail of primary_list
 * is popped and searched for in secondary_list with the protocol-specific
 * compare callback (tcp, udp, icmp or other, selected by conn->ip_proto).
 * On the path that found a match, the primary packet is written to chr_out
 * with compare_chr_send(), the matching secondary element is removed from
 * its queue and the primary packet is destroyed. On the other path the
 * primary packet is pushed back onto the tail of primary_list; the
 * checkpoint notification there is still a TODO.
 *
 * NOTE(review): declarations of pkt and ret, the case labels, the test on
 * result, the else branch and any break statements are missing from this
 * listing.
 * NOTE(review): no packet_destroy() of the removed secondary packet is
 * visible - confirm that it is not leaked.
 */
408 static void colo_compare_connection(void *opaque
, void *user_data
)
410 CompareState
*s
= user_data
;
411 Connection
*conn
= opaque
;
413 GList
*result
= NULL
;
416 while (!g_queue_is_empty(&conn
->primary_list
) &&
417 !g_queue_is_empty(&conn
->secondary_list
)) {
418 pkt
= g_queue_pop_tail(&conn
->primary_list
);
419 switch (conn
->ip_proto
) {
421 result
= g_queue_find_custom(&conn
->secondary_list
,
422 pkt
, (GCompareFunc
)colo_packet_compare_tcp
);
425 result
= g_queue_find_custom(&conn
->secondary_list
,
426 pkt
, (GCompareFunc
)colo_packet_compare_udp
);
429 result
= g_queue_find_custom(&conn
->secondary_list
,
430 pkt
, (GCompareFunc
)colo_packet_compare_icmp
);
433 result
= g_queue_find_custom(&conn
->secondary_list
,
434 pkt
, (GCompareFunc
)colo_packet_compare_other
);
439 ret
= compare_chr_send(&s
->chr_out
, pkt
->data
, pkt
->size
);
441 error_report("colo_send_primary_packet failed");
443 trace_colo_compare_main("packet same and release packet");
444 g_queue_remove(&conn
->secondary_list
, result
->data
);
445 packet_destroy(pkt
, NULL
);
448 * If one packet arrive late, the secondary_list or
449 * primary_list will be empty, so we can't compare it
450 * until next comparison.
452 trace_colo_compare_main("packet different");
453 g_queue_push_tail(&conn
->primary_list
, pkt
);
454 /* TODO: colo_notify_checkpoint();*/
/*
 * Write one packet to a character backend.
 *
 * @out: backend to write to
 *
 * Visible behaviour: the size is converted with htonl() and written first
 * as a 4-byte length header, followed by the payload buffer; both writes
 * use qemu_chr_fe_write_all(). The final visible return yields the
 * negative write result if there is one and -EIO otherwise.
 *
 * NOTE(review): the buf and size parameters, the declaration of ret, the
 * success return and the error-path control flow are missing from this
 * listing.
 */
460 static int compare_chr_send(CharBackend
*out
,
465 uint32_t len
= htonl(size
);
471 ret
= qemu_chr_fe_write_all(out
, (uint8_t *)&len
, sizeof(len
));
472 if (ret
!= sizeof(len
)) {
476 ret
= qemu_chr_fe_write_all(out
, (uint8_t *)buf
, size
);
484 return ret
< 0 ? ret
: -EIO
;
487 static int compare_chr_can_read(void *opaque
)
489 return COMPARE_READ_LEN_MAX
;
493 * Called from the main thread on the primary for packets
494 * arriving over the socket from the primary.
/*
 * Read handler of the primary input character backend.
 *
 * @opaque: the colo-compare object (checked with COLO_COMPARE)
 * @buf:    bytes received
 * @size:   number of bytes in @buf
 *
 * Feeds the received bytes into the primary socket read state with
 * net_fill_rstate(). The visible error handling clears the handlers of
 * chr_pri_in and reports "colo-compare primary_in error".
 *
 * NOTE(review): the declaration of ret, the condition on ret and the
 * remaining arguments of qemu_chr_fe_set_handlers() are missing from this
 * listing.
 */
496 static void compare_pri_chr_in(void *opaque
, const uint8_t *buf
, int size
)
498 CompareState
*s
= COLO_COMPARE(opaque
);
501 ret
= net_fill_rstate(&s
->pri_rs
, buf
, size
);
503 qemu_chr_fe_set_handlers(&s
->chr_pri_in
, NULL
, NULL
, NULL
,
505 error_report("colo-compare primary_in error");
510 * Called from the main thread on the primary for packets
511 * arriving over the socket from the secondary.
/*
 * Read handler of the secondary input character backend.
 *
 * @opaque: the colo-compare object (checked with COLO_COMPARE)
 * @buf:    bytes received
 * @size:   number of bytes in @buf
 *
 * Feeds the received bytes into the secondary socket read state with
 * net_fill_rstate(). The visible error handling clears the handlers of
 * chr_sec_in and reports "colo-compare secondary_in error".
 *
 * NOTE(review): the declaration of ret, the condition on ret and the
 * remaining arguments of qemu_chr_fe_set_handlers() are missing from this
 * listing.
 */
513 static void compare_sec_chr_in(void *opaque
, const uint8_t *buf
, int size
)
515 CompareState
*s
= COLO_COMPARE(opaque
);
518 ret
= net_fill_rstate(&s
->sec_rs
, buf
, size
);
520 qemu_chr_fe_set_handlers(&s
->chr_sec_in
, NULL
, NULL
, NULL
,
522 error_report("colo-compare secondary_in error");
527 * Check old packet regularly so it can watch for any packets
528 * that the secondary hasn't produced equivalents of.
530 static gboolean
check_old_packet_regular(void *opaque
)
532 CompareState
*s
= opaque
;
534 /* if have old packet we will notify checkpoint */
535 colo_old_packet_check(s
);
540 static void *colo_compare_thread(void *opaque
)
542 CompareState
*s
= opaque
;
543 GSource
*timeout_source
;
545 s
->worker_context
= g_main_context_new();
547 qemu_chr_fe_set_handlers(&s
->chr_pri_in
, compare_chr_can_read
,
548 compare_pri_chr_in
, NULL
, s
, s
->worker_context
, true);
549 qemu_chr_fe_set_handlers(&s
->chr_sec_in
, compare_chr_can_read
,
550 compare_sec_chr_in
, NULL
, s
, s
->worker_context
, true);
552 s
->compare_loop
= g_main_loop_new(s
->worker_context
, FALSE
);
554 /* To kick any packets that the secondary doesn't match */
555 timeout_source
= g_timeout_source_new(REGULAR_PACKET_CHECK_MS
);
556 g_source_set_callback(timeout_source
,
557 (GSourceFunc
)check_old_packet_regular
, s
, NULL
);
558 g_source_attach(timeout_source
, s
->worker_context
);
560 g_main_loop_run(s
->compare_loop
);
562 g_source_unref(timeout_source
);
563 g_main_loop_unref(s
->compare_loop
);
564 g_main_context_unref(s
->worker_context
);
568 static char *compare_get_pri_indev(Object
*obj
, Error
**errp
)
570 CompareState
*s
= COLO_COMPARE(obj
);
572 return g_strdup(s
->pri_indev
);
575 static void compare_set_pri_indev(Object
*obj
, const char *value
, Error
**errp
)
577 CompareState
*s
= COLO_COMPARE(obj
);
579 g_free(s
->pri_indev
);
580 s
->pri_indev
= g_strdup(value
);
583 static char *compare_get_sec_indev(Object
*obj
, Error
**errp
)
585 CompareState
*s
= COLO_COMPARE(obj
);
587 return g_strdup(s
->sec_indev
);
590 static void compare_set_sec_indev(Object
*obj
, const char *value
, Error
**errp
)
592 CompareState
*s
= COLO_COMPARE(obj
);
594 g_free(s
->sec_indev
);
595 s
->sec_indev
= g_strdup(value
);
598 static char *compare_get_outdev(Object
*obj
, Error
**errp
)
600 CompareState
*s
= COLO_COMPARE(obj
);
602 return g_strdup(s
->outdev
);
605 static void compare_set_outdev(Object
*obj
, const char *value
, Error
**errp
)
607 CompareState
*s
= COLO_COMPARE(obj
);
610 s
->outdev
= g_strdup(value
);
/*
 * Finalize callback of the primary socket read state (registered with
 * net_socket_rs_init() in colo_compare_complete()).
 *
 * @pri_rs: the read state embedded in a CompareState; the owner is
 *          recovered with container_of().
 *
 * Enqueues the completed packet as PRIMARY_IN. When packet_enqueue()
 * returns non-zero, the raw buffer is forwarded unchanged to chr_out with
 * compare_chr_send(). The visible code also runs colo_compare_connection()
 * over every tracked connection.
 *
 * NOTE(review): the braces and any else separating the two paths are
 * missing from this listing, so it cannot be told from here whether the
 * connection compare runs on both paths.
 */
613 static void compare_pri_rs_finalize(SocketReadState
*pri_rs
)
615 CompareState
*s
= container_of(pri_rs
, CompareState
, pri_rs
);
617 if (packet_enqueue(s
, PRIMARY_IN
)) {
618 trace_colo_compare_main("primary: unsupported packet in");
619 compare_chr_send(&s
->chr_out
, pri_rs
->buf
, pri_rs
->packet_len
);
621 /* compare connection */
622 g_queue_foreach(&s
->conn_list
, colo_compare_connection
, s
);
/*
 * Finalize callback of the secondary socket read state (registered with
 * net_socket_rs_init() in colo_compare_complete()).
 *
 * @sec_rs: the read state embedded in a CompareState; the owner is
 *          recovered with container_of().
 *
 * Enqueues the completed packet as SECONDARY_IN. When packet_enqueue()
 * returns non-zero only a trace message is emitted in the visible code;
 * nothing is forwarded to chr_out here. The visible code also runs
 * colo_compare_connection() over every tracked connection.
 *
 * NOTE(review): the braces and any else separating the two paths are
 * missing from this listing.
 */
626 static void compare_sec_rs_finalize(SocketReadState
*sec_rs
)
628 CompareState
*s
= container_of(sec_rs
, CompareState
, sec_rs
);
630 if (packet_enqueue(s
, SECONDARY_IN
)) {
631 trace_colo_compare_main("secondary: unsupported packet in");
633 /* compare connection */
634 g_queue_foreach(&s
->conn_list
, colo_compare_connection
, s
);
/*
 * Returns 0 on success.
 * Returns 1 on failure.
 */
/*
 * Look up a character device by name and verify that it is usable here.
 *
 * @chr: output location receiving the result of qemu_chr_find()
 *
 * Visible behaviour: stores the lookup result in *chr, sets a
 * "Device ... not found" error for the lookup failure path, and sets an
 * "is not reconnectable" error when the device lacks
 * QEMU_CHAR_FEATURE_RECONNECTABLE.
 *
 * NOTE(review): the chr_name and errp parameters, the NULL test on *chr,
 * the error message arguments and the return statements are missing from
 * this listing.
 */
643 static int find_and_check_chardev(Chardev
**chr
,
647 *chr
= qemu_chr_find(chr_name
);
649 error_setg(errp
, "Device '%s' not found",
654 if (!qemu_chr_has_feature(*chr
, QEMU_CHAR_FEATURE_RECONNECTABLE
)) {
655 error_setg(errp
, "chardev \"%s\" is not reconnectable",
664 * Called from the main thread on the primary
665 * to setup colo-compare.
/*
 * UserCreatable complete hook (installed by colo_compare_class_init()):
 * validates the properties and starts the compare thread.
 *
 * @uc:   the colo-compare object being completed
 * @errp: receives an error description on failure
 *
 * Visible steps:
 *  - reject the configuration when any of pri_indev, sec_indev or outdev
 *    is unset, or when any two of them name the same device;
 *  - resolve each of the three names with find_and_check_chardev() and
 *    bind it to chr_pri_in, chr_sec_in and chr_out with qemu_chr_fe_init();
 *  - initialise both socket read states with their finalize callbacks,
 *    the connection queue and the connection tracking hash table;
 *  - create a joinable thread running colo_compare_thread(), named
 *    "colo-compare N" where N comes from the static counter compare_id.
 *
 * NOTE(review): the declaration of chr, the statements inside the failure
 * branches, the remaining g_hash_table_new_full() arguments and any update
 * of compare_id are missing from this listing.
 */
667 static void colo_compare_complete(UserCreatable
*uc
, Error
**errp
)
669 CompareState
*s
= COLO_COMPARE(uc
);
671 char thread_name
[64];
672 static int compare_id
;
674 if (!s
->pri_indev
|| !s
->sec_indev
|| !s
->outdev
) {
675 error_setg(errp
, "colo compare needs 'primary_in' ,"
676 "'secondary_in','outdev' property set");
678 } else if (!strcmp(s
->pri_indev
, s
->outdev
) ||
679 !strcmp(s
->sec_indev
, s
->outdev
) ||
680 !strcmp(s
->pri_indev
, s
->sec_indev
)) {
681 error_setg(errp
, "'indev' and 'outdev' could not be same "
682 "for compare module");
686 if (find_and_check_chardev(&chr
, s
->pri_indev
, errp
) ||
687 !qemu_chr_fe_init(&s
->chr_pri_in
, chr
, errp
)) {
691 if (find_and_check_chardev(&chr
, s
->sec_indev
, errp
) ||
692 !qemu_chr_fe_init(&s
->chr_sec_in
, chr
, errp
)) {
696 if (find_and_check_chardev(&chr
, s
->outdev
, errp
) ||
697 !qemu_chr_fe_init(&s
->chr_out
, chr
, errp
)) {
701 net_socket_rs_init(&s
->pri_rs
, compare_pri_rs_finalize
);
702 net_socket_rs_init(&s
->sec_rs
, compare_sec_rs_finalize
);
704 g_queue_init(&s
->conn_list
);
706 s
->connection_track_table
= g_hash_table_new_full(connection_key_hash
,
707 connection_key_equal
,
711 sprintf(thread_name
, "colo-compare %d", compare_id
);
712 qemu_thread_create(&s
->thread
, thread_name
,
713 colo_compare_thread
, s
,
714 QEMU_THREAD_JOINABLE
);
/*
 * Drain both packet queues of one connection; invoked through
 * g_queue_foreach() from colo_compare_finalize().
 *
 * @opaque:    the Connection to drain
 * @user_data: the owning CompareState
 *
 * Every packet still waiting in primary_list is sent to chr_out with
 * compare_chr_send() and then destroyed. Packets left in secondary_list
 * are destroyed without being sent. The result of compare_chr_send() is
 * not checked here.
 *
 * NOTE(review): the declaration of pkt is missing from this listing.
 */
720 static void colo_flush_packets(void *opaque
, void *user_data
)
722 CompareState
*s
= user_data
;
723 Connection
*conn
= opaque
;
726 while (!g_queue_is_empty(&conn
->primary_list
)) {
727 pkt
= g_queue_pop_head(&conn
->primary_list
);
728 compare_chr_send(&s
->chr_out
, pkt
->data
, pkt
->size
);
729 packet_destroy(pkt
, NULL
);
731 while (!g_queue_is_empty(&conn
->secondary_list
)) {
732 pkt
= g_queue_pop_head(&conn
->secondary_list
);
733 packet_destroy(pkt
, NULL
);
737 static void colo_compare_class_init(ObjectClass
*oc
, void *data
)
739 UserCreatableClass
*ucc
= USER_CREATABLE_CLASS(oc
);
741 ucc
->complete
= colo_compare_complete
;
/*
 * Instance initializer: registers the three string properties
 * "primary_in", "secondary_in" and "outdev", each backed by its
 * getter/setter pair defined in this file.
 *
 * NOTE(review): the final argument of each object_property_add_str() call
 * is missing from this listing.
 */
744 static void colo_compare_init(Object
*obj
)
746 object_property_add_str(obj
, "primary_in",
747 compare_get_pri_indev
, compare_set_pri_indev
,
749 object_property_add_str(obj
, "secondary_in",
750 compare_get_sec_indev
, compare_set_sec_indev
,
752 object_property_add_str(obj
, "outdev",
753 compare_get_outdev
, compare_set_outdev
,
757 static void colo_compare_finalize(Object
*obj
)
759 CompareState
*s
= COLO_COMPARE(obj
);
761 qemu_chr_fe_set_handlers(&s
->chr_pri_in
, NULL
, NULL
, NULL
, NULL
,
762 s
->worker_context
, true);
763 qemu_chr_fe_set_handlers(&s
->chr_sec_in
, NULL
, NULL
, NULL
, NULL
,
764 s
->worker_context
, true);
765 qemu_chr_fe_deinit(&s
->chr_out
);
767 g_main_loop_quit(s
->compare_loop
);
768 qemu_thread_join(&s
->thread
);
770 /* Release all unhandled packets after compare thead exited */
771 g_queue_foreach(&s
->conn_list
, colo_flush_packets
, s
);
773 g_queue_clear(&s
->conn_list
);
775 g_hash_table_destroy(s
->connection_track_table
);
776 g_free(s
->pri_indev
);
777 g_free(s
->sec_indev
);
/*
 * Type description of the colo-compare object: a TYPE_OBJECT subclass
 * named TYPE_COLO_COMPARE with CompareState as instance data and
 * CompareClass as class data, wired to the init, finalize and class_init
 * functions of this file, and implementing the TYPE_USER_CREATABLE
 * interface.
 *
 * NOTE(review): the end of the interfaces array and of the initializer is
 * missing from this listing.
 */
781 static const TypeInfo colo_compare_info
= {
782 .name
= TYPE_COLO_COMPARE
,
783 .parent
= TYPE_OBJECT
,
784 .instance_size
= sizeof(CompareState
),
785 .instance_init
= colo_compare_init
,
786 .instance_finalize
= colo_compare_finalize
,
787 .class_size
= sizeof(CompareClass
),
788 .class_init
= colo_compare_class_init
,
789 .interfaces
= (InterfaceInfo
[]) {
790 { TYPE_USER_CREATABLE
},
795 static void register_types(void)
797 type_register_static(&colo_compare_info
);
800 type_init(register_types
);