/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

/* one bucket per stored stack trace: the jhash of the trace, the number of
 * frames and the frames themselves (up to value_size bytes of u64 ips)
 */
struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
        u32 nr;
        u64 ip[];
};

struct bpf_stack_map {
        struct bpf_map map;
        void *elems;
        struct pcpu_freelist freelist;
        u32 n_buckets;
        struct stack_map_bucket *buckets[];
};
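
/* All buckets are allocated up front and handed out through a per-cpu
 * freelist, so bpf_get_stackid() can grab a free bucket without having to
 * allocate memory at run time.
 */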
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
        int err;

        smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
                                         smap->map.numa_node);
        if (!smap->elems)
                return -ENOMEM;

        err = pcpu_freelist_init(&smap->freelist);
        if (err)
                goto free_elems;

        pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
                               smap->map.max_entries);
        return 0;

free_elems:
        bpf_map_area_free(smap->elems);
        return err;
}
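
/* The map key is a u32 stack id and each value is an array of u64
 * instruction pointers, hence the key_size == 4 and multiple-of-8
 * value_size checks below.
 */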
/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
        u32 value_size = attr->value_size;
        struct bpf_stack_map *smap;
        u64 cost, n_buckets;
        int err;

        if (!capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);

        if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            value_size < 8 || value_size % 8 ||
            value_size / 8 > sysctl_perf_event_max_stack)
                return ERR_PTR(-EINVAL);

        /* hash table size must be power of 2 */
        n_buckets = roundup_pow_of_two(attr->max_entries);

        cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
        if (cost >= U32_MAX - PAGE_SIZE)
                return ERR_PTR(-E2BIG);

        smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
        if (!smap)
                return ERR_PTR(-ENOMEM);

        err = -E2BIG;
        /* charge the bucket storage as well; it is allocated separately in
         * prealloc_elems_and_freelist()
         */
        cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
        if (cost >= U32_MAX - PAGE_SIZE)
                goto free_smap;

        smap->map.map_type = attr->map_type;
        smap->map.key_size = attr->key_size;
        smap->map.value_size = value_size;
        smap->map.max_entries = attr->max_entries;
        smap->map.map_flags = attr->map_flags;
        smap->n_buckets = n_buckets;
        smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
        smap->map.numa_node = bpf_map_attr_numa_node(attr);

        err = bpf_map_precharge_memlock(smap->map.pages);
        if (err)
                goto free_smap;

        err = get_callchain_buffers(sysctl_perf_event_max_stack);
        if (err)
                goto free_smap;

        err = prealloc_elems_and_freelist(smap);
        if (err)
                goto put_buffers;

        return &smap->map;

put_buffers:
        put_callchain_buffers();
free_smap:
        bpf_map_area_free(smap);
        return ERR_PTR(err);
}
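
/* bpf_get_stackid(ctx, map, flags): capture the current call chain, store
 * it in @map and return its id (the bucket index).  The low bits of @flags
 * (BPF_F_SKIP_FIELD_MASK) give the number of initial frames to skip,
 * BPF_F_USER_STACK captures the user stack instead of the kernel stack,
 * BPF_F_FAST_STACK_CMP compares traces by hash only, and
 * BPF_F_REUSE_STACKID overwrites the existing bucket on a hash collision
 * instead of returning -EEXIST.
 */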
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
           u64, flags)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct perf_callchain_entry *trace;
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / 8;
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
        u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
        bool kernel = !user;
        u64 *ips;

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

        trace = get_perf_callchain(regs, init_nr, kernel, user,
                                   sysctl_perf_event_max_stack, false, false);

        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
                return -EFAULT;

        /* get_perf_callchain() guarantees that trace->nr >= init_nr
         * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
         */
        trace_nr = trace->nr - init_nr;

        if (trace_nr <= skip)
                /* skipping more than usable stack trace */
                return -EFAULT;

        trace_nr -= skip;
        trace_len = trace_nr * sizeof(u64);
        ips = trace->ip + skip + init_nr;
        hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
        id = hash & (smap->n_buckets - 1);
        bucket = READ_ONCE(smap->buckets[id]);

        if (bucket && bucket->hash == hash) {
                if (flags & BPF_F_FAST_STACK_CMP)
                        return id;
                if (bucket->nr == trace_nr &&
                    memcmp(bucket->ip, ips, trace_len) == 0)
                        return id;
        }

        /* this call stack is not in the map, try to add it */
        if (bucket && !(flags & BPF_F_REUSE_STACKID))
                return -EEXIST;

        new_bucket = (struct stack_map_bucket *)
                pcpu_freelist_pop(&smap->freelist);
        if (unlikely(!new_bucket))
                return -ENOMEM;

        memcpy(new_bucket->ip, ips, trace_len);
        new_bucket->hash = hash;
        new_bucket->nr = trace_nr;

        old_bucket = xchg(&smap->buckets[id], new_bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return id;
}

const struct bpf_func_proto bpf_get_stackid_proto = {
        .func           = bpf_get_stackid,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
};
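
/* Illustrative usage (not part of this file): a tracing program attached
 * to a kprobe or perf event typically calls
 *	id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);
 * where "stack_traces" is a BPF_MAP_TYPE_STACK_TRACE map; user space later
 * reads the trace back through the lookup syscall, which lands in
 * bpf_stackmap_copy() below.
 */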
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
        return NULL;
}
/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *old_bucket;
        u32 id = *(u32 *)key, trace_len;

        if (unlikely(id >= smap->n_buckets))
                return -ENOENT;

        /* pull the bucket out of the table so a concurrent update cannot
         * recycle it while we copy
         */
        bucket = xchg(&smap->buckets[id], NULL);
        if (!bucket)
                return -ENOENT;

        trace_len = bucket->nr * sizeof(u64);
        memcpy(value, bucket->ip, trace_len);
        memset(value + trace_len, 0, map->value_size - trace_len);

        old_bucket = xchg(&smap->buckets[id], bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return 0;
}
static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        return -EINVAL;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
                                 u64 map_flags)
{
        return -EINVAL;
}
/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *old_bucket;
        u32 id = *(u32 *)key;

        if (unlikely(id >= smap->n_buckets))
                return -E2BIG;

        old_bucket = xchg(&smap->buckets[id], NULL);
        if (old_bucket) {
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
                return 0;
        } else {
                return -ENOENT;
        }
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

        /* wait for bpf programs to complete before freeing stack map */
        synchronize_rcu();

        bpf_map_area_free(smap->elems);
        pcpu_freelist_destroy(&smap->freelist);
        bpf_map_area_free(smap);
        put_callchain_buffers();
}
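
/* These ops back BPF_MAP_TYPE_STACK_TRACE maps. */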
const struct bpf_map_ops stack_map_ops = {
        .map_alloc = stack_map_alloc,
        .map_free = stack_map_free,
        .map_get_next_key = stack_map_get_next_key,
        .map_lookup_elem = stack_map_lookup_elem,
        .map_update_elem = stack_map_update_elem,
        .map_delete_elem = stack_map_delete_elem,
};