(* Common code for ARM NEON header file, documentation and test case
   generators.

   Copyright (C) 2006-2014 Free Software Foundation, Inc.
   Contributed by CodeSourcery.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  *)

(* Shorthand types for vector elements.  *)
type elts = S8 | S16 | S32 | S64 | F16 | F32 | U8 | U16 | U32 | U64 | P8 | P16
          | P64 | P128 | I8 | I16 | I32 | I64 | B8 | B16 | B32 | B64
          | Conv of elts * elts | Cast of elts * elts | NoElts

type eltclass = Signed | Unsigned | Float | Poly | Int | Bits
              | ConvClass of eltclass * eltclass | NoType
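
(* Illustrative note (this binding is not used by the generator): in Conv and
   Cast the first argument is the destination element type and the second the
   source -- see the destructuring in CONV further down -- so the value below
   describes the float-to-signed conversion performed by "vcvt".  *)
let _example_f32_to_s32 : elts = Conv (S32, F32)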

(* These vector types correspond directly to C types.  *)
type vectype = T_int8x8 | T_int8x16
             | T_int16x4 | T_int16x8
             | T_int32x2 | T_int32x4
             | T_int64x1 | T_int64x2
             | T_uint8x8 | T_uint8x16
             | T_uint16x4 | T_uint16x8
             | T_uint32x2 | T_uint32x4
             | T_uint64x1 | T_uint64x2
             | T_float16x4
             | T_float32x2 | T_float32x4
             | T_poly8x8 | T_poly8x16
             | T_poly16x4 | T_poly16x8
             | T_immediate of int * int
             | T_int8 | T_int16
             | T_int32 | T_int64
             | T_uint8 | T_uint16
             | T_uint32 | T_uint64
             | T_poly8 | T_poly16
             | T_poly64 | T_poly64x1
             | T_poly64x2 | T_poly128
             | T_float16 | T_float32
             | T_arrayof of int * vectype
             | T_ptrto of vectype | T_const of vectype
             | T_void | T_intQI
             | T_intHI | T_intSI
             | T_intDI | T_intTI
             | T_floatHF | T_floatSF

(* The meanings of the following are:
     TImode : "Tetra", two registers (four words).
     EImode : "hExa", three registers (six words).
     OImode : "Octa", four registers (eight words).
     CImode : "dodeCa", six registers (twelve words).
     XImode : "heXadeca", eight registers (sixteen words).
*)

type inttype = B_TImode | B_EImode | B_OImode | B_CImode | B_XImode

type shape_elt = Dreg | Qreg | Corereg | Immed | VecArray of int * shape_elt
               | PtrTo of shape_elt | CstPtrTo of shape_elt
               (* These next ones are used only in the test generator.  *)
               | Element_of_dreg        (* Used for "lane" variants.  *)
               | Element_of_qreg        (* Likewise.  *)
               | All_elements_of_dreg   (* Used for "dup" variants.  *)
               | Alternatives of shape_elt list (* Used for multiple valid operands.  *)

type shape_form = All of int * shape_elt
                | Long
                | Long_noreg of shape_elt
                | Wide
                | Wide_noreg of shape_elt
                | Narrow
                | Long_imm
                | Narrow_imm
                | Binary_imm of shape_elt
                | Use_operands of shape_elt array
                | By_scalar of shape_elt
                | Unary_scalar of shape_elt
                | Wide_lane
                | Wide_scalar
                | Pair_result of shape_elt

type arity = Arity0 of vectype
           | Arity1 of vectype * vectype
           | Arity2 of vectype * vectype * vectype
           | Arity3 of vectype * vectype * vectype * vectype
           | Arity4 of vectype * vectype * vectype * vectype * vectype

type vecmode = V8QI | V4HI | V4HF | V2SI | V2SF | DI
             | V16QI | V8HI | V4SI | V4SF | V2DI | TI
             | QI | HI | SI | SF

type opcode =
  (* Binary ops.  *)
    Vadd
  | Vmul
  | Vmla
  | Vmls
  | Vfma
  | Vfms
  | Vsub
  | Vceq
  | Vcge
  | Vcgt
  | Vcle
  | Vclt
  | Vcage
  | Vcagt
  | Vcale
  | Vcalt
  | Vtst
  | Vabd
  | Vaba
  | Vmax
  | Vmin
  | Vpadd
  | Vpada
  | Vpmax
  | Vpmin
  | Vrecps
  | Vrsqrts
  | Vshl
  | Vshr_n
  | Vshl_n
  | Vsra_n
  | Vsri
  | Vsli
  (* Logic binops.  *)
  | Vand
  | Vorr
  | Veor
  | Vbic
  | Vorn
  | Vbsl
  (* Ops with scalar.  *)
  | Vmul_lane
  | Vmla_lane
  | Vmls_lane
  | Vmul_n
  | Vmla_n
  | Vmls_n
  | Vmull_n
  | Vmull_lane
  | Vqdmull_n
  | Vqdmull_lane
  | Vqdmulh_n
  | Vqdmulh_lane
  (* Unary ops.  *)
  | Vrintn
  | Vrinta
  | Vrintp
  | Vrintm
  | Vrintz
  | Vabs
  | Vneg
  | Vcls
  | Vclz
  | Vcnt
  | Vrecpe
  | Vrsqrte
  | Vmvn
  (* Vector extract.  *)
  | Vext
  (* Reverse elements.  *)
  | Vrev64
  | Vrev32
  | Vrev16
  (* Transposition ops.  *)
  | Vtrn
  | Vzip
  | Vuzp
  (* Loads and stores (VLD1/VST1/VLD2...), elements and structures.  *)
  | Vldx of int
  | Vstx of int
  | Vldx_lane of int
  | Vldx_dup of int
  | Vstx_lane of int
  (* Set/extract lanes from a vector.  *)
  | Vget_lane
  | Vset_lane
  (* Initialize vector from bit pattern.  *)
  | Vcreate
  (* Set all lanes to same value.  *)
  | Vdup_n
  | Vmov_n  (* Is this the same?  *)
  (* Duplicate scalar to all lanes of vector.  *)
  | Vdup_lane
  (* Combine vectors.  *)
  | Vcombine
  (* Get quadword high/low parts.  *)
  | Vget_high
  | Vget_low
  (* Convert vectors.  *)
  | Vcvt
  | Vcvt_n
  (* Narrow/lengthen vectors.  *)
  | Vmovn
  | Vmovl
  (* Table lookup.  *)
  | Vtbl of int
  | Vtbx of int
  (* Reinterpret casts.  *)
  | Vreinterp

let rev_elems revsize elsize nelts _ =
  let mask = (revsize / elsize) - 1 in
  let arr = Array.init nelts
    (fun i -> i lxor mask) in
  Array.to_list arr
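
(* Worked example (illustrative, not part of the original generator):
   reversing the 16-bit elements within each 64-bit chunk of an 8-element
   vector.  mask = (64/16) - 1 = 3, and each index maps to (i lxor 3), giving
   a __builtin_shuffle mask that reverses every group of four elements.  *)
let _ = assert (rev_elems 64 16 8 `lo = [3; 2; 1; 0; 7; 6; 5; 4])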

let permute_range i stride nelts increment =
  let rec build i = function
    0 -> []
  | nelts -> i :: (i + stride) :: build (i + increment) (pred nelts) in
  build i nelts

(* Generate a list of integers suitable for vzip.  *)
let zip_range i stride nelts = permute_range i stride nelts 1

(* Generate a list of integers suitable for vunzip.  *)
let uzip_range i stride nelts = permute_range i stride nelts 4

(* Generate a list of integers suitable for trn.  *)
let trn_range i stride nelts = permute_range i stride nelts 2

let zip_elems _ nelts part =
  match part with
    `lo -> zip_range 0 nelts (nelts / 2)
  | `hi -> zip_range (nelts / 2) nelts (nelts / 2)

let uzip_elems _ nelts part =
  match part with
    `lo -> uzip_range 0 2 (nelts / 2)
  | `hi -> uzip_range 1 2 (nelts / 2)

let trn_elems _ nelts part =
  match part with
    `lo -> trn_range 0 nelts (nelts / 2)
  | `hi -> trn_range 1 nelts (nelts / 2)
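
(* Worked examples (illustrative) of the low-half shuffle masks for a
   4-element vector: indices below nelts select from the first input of
   __builtin_shuffle, indices >= nelts from the second.  The first argument
   (element size) is ignored by these functions.  *)
let _ = assert (zip_elems 32 4 `lo = [0; 4; 1; 5])   (* interleave low halves *)
let _ = assert (uzip_elems 32 4 `lo = [0; 2; 4; 6])  (* even-indexed elements *)
let _ = assert (trn_elems 32 4 `lo = [0; 4; 2; 6])   (* transpose even lanes *)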

(* Features used for documentation, to distinguish between some instruction
   variants, and to signal special requirements (e.g. swapping arguments).  *)

type features =
    Halving
  | Rounding
  | Saturating
  | Dst_unsign
  | High_half
  | Doubling
  | Flipped of string  (* Builtin name to use with flipped arguments.  *)
  | InfoWord  (* Pass an extra word for signage/rounding etc. (always passed
                 for All _, Long, Wide, Narrow shape_forms).  *)
    (* Implement builtin as shuffle.  The parameter is a function which returns
       masks suitable for __builtin_shuffle: arguments are (element size,
       number of elements, high/low part selector).  *)
  | Use_shuffle of (int -> int -> [`lo|`hi] -> int list)
    (* A specification as to the shape of instruction expected upon
       disassembly, used if it differs from the shape used to build the
       intrinsic prototype.  Multiple entries in the constructor's argument
       indicate that the intrinsic expands to more than one assembly
       instruction, each with a corresponding shape specified here.  *)
  | Disassembles_as of shape_form list
  | Builtin_name of string  (* Override the name of the builtin.  *)
    (* Override the name of the instruction.  If more than one name
       is specified, it means that the instruction can have any of those
       names.  *)
  | Instruction_name of string list
    (* Mark that the intrinsic yields no instructions, or expands to yield
       behavior that the test generator cannot test.  *)
  | No_op
    (* Mark that the intrinsic has constant arguments that cannot be set
       to the defaults (zero for pointers and one otherwise) in the test
       cases.  The function supplied must return the integer to be written
       into the testcase for the argument number (0-based) supplied to it.  *)
  | Const_valuator of (int -> int)
  | Fixed_vector_reg
  | Fixed_core_reg
    (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined.  *)
  | Requires_feature of string
    (* Mark that the intrinsic requires a particular architecture version.  *)
  | Requires_arch of int
    (* Mark that the intrinsic requires a particular bit in __ARM_FP to
       be set.  *)
  | Requires_FP_bit of int
    (* Compiler optimization level for the test.  *)
  | Compiler_optim of string

exception MixedMode of elts * elts

let rec elt_width = function
    S8 | U8 | P8 | I8 | B8 -> 8
  | S16 | U16 | P16 | I16 | B16 | F16 -> 16
  | S32 | F32 | U32 | I32 | B32 -> 32
  | S64 | U64 | P64 | I64 | B64 -> 64
  | P128 -> 128
  | Conv (a, b) ->
      let wa = elt_width a and wb = elt_width b in
      if wa = wb then wa else raise (MixedMode (a, b))
  | Cast (a, b) -> raise (MixedMode (a, b))
  | NoElts -> failwith "No elts"

let rec elt_class = function
    S8 | S16 | S32 | S64 -> Signed
  | U8 | U16 | U32 | U64 -> Unsigned
  | P8 | P16 | P64 | P128 -> Poly
  | F16 | F32 -> Float
  | I8 | I16 | I32 | I64 -> Int
  | B8 | B16 | B32 | B64 -> Bits
  | Conv (a, b) | Cast (a, b) -> ConvClass (elt_class a, elt_class b)
  | NoElts -> NoType

let elt_of_class_width c w =
  match c, w with
    Signed, 8 -> S8
  | Signed, 16 -> S16
  | Signed, 32 -> S32
  | Signed, 64 -> S64
  | Float, 16 -> F16
  | Float, 32 -> F32
  | Unsigned, 8 -> U8
  | Unsigned, 16 -> U16
  | Unsigned, 32 -> U32
  | Unsigned, 64 -> U64
  | Poly, 8 -> P8
  | Poly, 16 -> P16
  | Poly, 64 -> P64
  | Poly, 128 -> P128
  | Int, 8 -> I8
  | Int, 16 -> I16
  | Int, 32 -> I32
  | Int, 64 -> I64
  | Bits, 8 -> B8
  | Bits, 16 -> B16
  | Bits, 32 -> B32
  | Bits, 64 -> B64
  | _ -> failwith "Bad element type"

(* Return unsigned integer element the same width as argument.  *)
let unsigned_of_elt elt =
  elt_of_class_width Unsigned (elt_width elt)

let signed_of_elt elt =
  elt_of_class_width Signed (elt_width elt)

(* Return untyped bits element the same width as argument.  *)
let bits_of_elt elt =
  elt_of_class_width Bits (elt_width elt)

let non_signed_variant = function
    S8 -> I8
  | S16 -> I16
  | S32 -> I32
  | S64 -> I64
  | U8 -> I8
  | U16 -> I16
  | U32 -> I32
  | U64 -> I64
  | x -> x

let poly_unsigned_variant v =
  let elclass = match elt_class v with
    Poly -> Unsigned
  | x -> x in
  elt_of_class_width elclass (elt_width v)

let widen_elt elt =
  let w = elt_width elt
  and c = elt_class elt in
  elt_of_class_width c (w * 2)

let narrow_elt elt =
  let w = elt_width elt
  and c = elt_class elt in
  elt_of_class_width c (w / 2)
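
(* Illustrative sanity checks (added for documentation) for the element-type
   helpers above.  *)
let _ = assert (unsigned_of_elt S32 = U32)
let _ = assert (bits_of_elt F32 = B32)
let _ = assert (widen_elt S8 = S16)
let _ = assert (narrow_elt U64 = U32)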

(* If we're trying to find a mode from a "Use_operands" instruction, use the
   last vector operand as the dominant mode used to invoke the correct builtin.
   We must stick to this rule in neon.md.  *)
let find_key_operand operands =
  let rec scan opno =
    match operands.(opno) with
      Qreg -> Qreg
    | Dreg -> Dreg
    | VecArray (_, Qreg) -> Qreg
    | VecArray (_, Dreg) -> Dreg
    | _ -> scan (opno-1)
  in
  scan ((Array.length operands) - 1)
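
(* Illustrative: scanning from the right, immediates and core registers are
   skipped, so for "vget_lane"-style operands the D register is the key
   operand.  *)
let _ = assert (find_key_operand [| Corereg; Dreg; Immed |] = Dreg)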

(* Find a vecmode from a shape_elt ELT for an instruction with shape_form
   SHAPE.  For a Use_operands shape, if ARGPOS is passed then return the mode
   for the given argument position, else determine which argument to return a
   mode for automatically.  *)

let rec mode_of_elt ?argpos elt shape =
  let flt = match elt_class elt with
    Float | ConvClass (_, Float) -> true | _ -> false in
  let idx =
    match elt_width elt with
      8 -> 0 | 16 -> 1 | 32 -> 2 | 64 -> 3 | 128 -> 4
    | _ -> failwith "Bad element width"
  in match shape with
    All (_, Dreg) | By_scalar Dreg | Pair_result Dreg | Unary_scalar Dreg
  | Binary_imm Dreg | Long_noreg Dreg | Wide_noreg Dreg ->
      if flt then
        [| V8QI; V4HF; V2SF; DI |].(idx)
      else
        [| V8QI; V4HI; V2SI; DI |].(idx)
  | All (_, Qreg) | By_scalar Qreg | Pair_result Qreg | Unary_scalar Qreg
  | Binary_imm Qreg | Long_noreg Qreg | Wide_noreg Qreg ->
      [| V16QI; V8HI; if flt then V4SF else V4SI; V2DI; TI |].(idx)
  | All (_, (Corereg | PtrTo _ | CstPtrTo _)) ->
      [| QI; HI; if flt then SF else SI; DI |].(idx)
  | Long | Wide | Wide_lane | Wide_scalar
  | Long_imm ->
      [| V8QI; V4HI; V2SI; DI |].(idx)
  | Narrow | Narrow_imm -> [| V16QI; V8HI; V4SI; V2DI |].(idx)
  | Use_operands ops ->
      begin match argpos with
        None -> mode_of_elt ?argpos elt (All (0, (find_key_operand ops)))
      | Some pos -> mode_of_elt ?argpos elt (All (0, ops.(pos)))
      end
  | _ -> failwith "invalid shape"
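
(* Illustrative: element class and width together select the machine mode.
   S16 in a D-register shape gives V4HI; F32 takes the float branch and gives
   V2SF.  *)
let _ = assert (mode_of_elt S16 (All (3, Dreg)) = V4HI)
let _ = assert (mode_of_elt F32 (All (3, Dreg)) = V2SF)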

(* Modify an element type dependent on the shape of the instruction and the
   operand number.  *)

let shapemap shape no =
  let ident = fun x -> x in
  match shape with
    All _ | Use_operands _ | By_scalar _ | Pair_result _ | Unary_scalar _
  | Binary_imm _ -> ident
  | Long | Long_noreg _ | Wide_scalar | Long_imm ->
      [| widen_elt; ident; ident |].(no)
  | Wide | Wide_noreg _ -> [| widen_elt; widen_elt; ident |].(no)
  | Wide_lane -> [| widen_elt; ident; ident; ident |].(no)
  | Narrow | Narrow_imm -> [| narrow_elt; ident; ident |].(no)

(* Register type (D/Q) of an operand, based on shape and operand number.  *)

let regmap shape no =
  match shape with
    All (_, reg) | Long_noreg reg | Wide_noreg reg -> reg
  | Long -> [| Qreg; Dreg; Dreg |].(no)
  | Wide -> [| Qreg; Qreg; Dreg |].(no)
  | Narrow -> [| Dreg; Qreg; Qreg |].(no)
  | Wide_lane -> [| Qreg; Dreg; Dreg; Immed |].(no)
  | Wide_scalar -> [| Qreg; Dreg; Corereg |].(no)
  | By_scalar reg -> [| reg; reg; Dreg; Immed |].(no)
  | Unary_scalar reg -> [| reg; Dreg; Immed |].(no)
  | Pair_result reg -> [| VecArray (2, reg); reg; reg |].(no)
  | Binary_imm reg -> [| reg; reg; Immed |].(no)
  | Long_imm -> [| Qreg; Dreg; Immed |].(no)
  | Narrow_imm -> [| Dreg; Qreg; Immed |].(no)
  | Use_operands these -> these.(no)

let type_for_elt shape elt no =
  let elt = (shapemap shape no) elt in
  let reg = regmap shape no in
  let rec type_for_reg_elt reg elt =
    match reg with
      Dreg ->
        begin match elt with
          S8 -> T_int8x8
        | S16 -> T_int16x4
        | S32 -> T_int32x2
        | S64 -> T_int64x1
        | U8 -> T_uint8x8
        | U16 -> T_uint16x4
        | U32 -> T_uint32x2
        | U64 -> T_uint64x1
        | P64 -> T_poly64x1
        | P128 -> T_poly128
        | F16 -> T_float16x4
        | F32 -> T_float32x2
        | P8 -> T_poly8x8
        | P16 -> T_poly16x4
        | _ -> failwith "Bad elt type for Dreg"
        end
    | Qreg ->
        begin match elt with
          S8 -> T_int8x16
        | S16 -> T_int16x8
        | S32 -> T_int32x4
        | S64 -> T_int64x2
        | U8 -> T_uint8x16
        | U16 -> T_uint16x8
        | U32 -> T_uint32x4
        | U64 -> T_uint64x2
        | F32 -> T_float32x4
        | P8 -> T_poly8x16
        | P16 -> T_poly16x8
        | P64 -> T_poly64x2
        | P128 -> T_poly128
        | _ -> failwith "Bad elt type for Qreg"
        end
    | Corereg ->
        begin match elt with
          S8 -> T_int8
        | S16 -> T_int16
        | S32 -> T_int32
        | S64 -> T_int64
        | U8 -> T_uint8
        | U16 -> T_uint16
        | U32 -> T_uint32
        | U64 -> T_uint64
        | P8 -> T_poly8
        | P16 -> T_poly16
        | P64 -> T_poly64
        | P128 -> T_poly128
        | F32 -> T_float32
        | _ -> failwith "Bad elt type for Corereg"
        end
    | Immed ->
        T_immediate (0, 0)
    | VecArray (num, sub) ->
        T_arrayof (num, type_for_reg_elt sub elt)
    | PtrTo x ->
        T_ptrto (type_for_reg_elt x elt)
    | CstPtrTo x ->
        T_ptrto (T_const (type_for_reg_elt x elt))
    (* Anything else is solely for the use of the test generator.  *)
    | _ -> assert false
  in
  type_for_reg_elt reg elt
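
(* Illustrative: for the Long shape ("vaddl" and friends) operand 0 is the
   widened Q-register result and operand 1 a D-register source, so an S8
   element yields int16x8_t and int8x8_t respectively.  *)
let _ = assert (type_for_elt Long S8 0 = T_int16x8)
let _ = assert (type_for_elt Long S8 1 = T_int8x8)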

(* Return size of a vector type, in bits.  *)
let vectype_size = function
    T_int8x8 | T_int16x4 | T_int32x2 | T_int64x1
  | T_uint8x8 | T_uint16x4 | T_uint32x2 | T_uint64x1
  | T_float32x2 | T_poly8x8 | T_poly64x1 | T_poly16x4 | T_float16x4 -> 64
  | T_int8x16 | T_int16x8 | T_int32x4 | T_int64x2
  | T_uint8x16 | T_uint16x8 | T_uint32x4 | T_uint64x2
  | T_float32x4 | T_poly8x16 | T_poly64x2 | T_poly16x8 -> 128
  | _ -> raise Not_found

let inttype_for_array num elttype =
  let eltsize = vectype_size elttype in
  let numwords = (num * eltsize) / 32 in
  match numwords with
    4 -> B_TImode
  | 6 -> B_EImode
  | 8 -> B_OImode
  | 12 -> B_CImode
  | 16 -> B_XImode
  | _ -> failwith ("no int type for size " ^ string_of_int numwords)
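
(* Illustrative: a pair of 128-bit vectors (e.g. an int32x4x2_t) occupies
   2 * 128 / 32 = 8 words, i.e. four D registers, hence OImode.  *)
let _ = assert (inttype_for_array 2 T_int32x4 = B_OImode)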

(* These functions return pairs of (internal, external) types, where "internal"
   types are those seen by GCC, and "external" are those seen by the assembler.
   These types aren't necessarily the same, since the intrinsics can munge more
   than one C type into each assembler opcode.  *)

let make_sign_invariant func shape elt =
  let arity, elt' = func shape elt in
  arity, non_signed_variant elt'

(* Don't restrict any types.  *)

let elts_same make_arity shape elt =
  let vtype = type_for_elt shape elt in
  make_arity vtype, elt

(* As sign_invar_*, but when sign matters.  *)
let elts_same_io_lane =
  elts_same (fun vtype -> Arity4 (vtype 0, vtype 0, vtype 1, vtype 2, vtype 3))

let elts_same_io =
  elts_same (fun vtype -> Arity3 (vtype 0, vtype 0, vtype 1, vtype 2))

let elts_same_2_lane =
  elts_same (fun vtype -> Arity3 (vtype 0, vtype 1, vtype 2, vtype 3))

let elts_same_3 = elts_same_2_lane

let elts_same_2 =
  elts_same (fun vtype -> Arity2 (vtype 0, vtype 1, vtype 2))

let elts_same_1 =
  elts_same (fun vtype -> Arity1 (vtype 0, vtype 1))

(* Use for signed/unsigned invariant operations (i.e. where the operation
   doesn't depend on the sign of the data).  *)

let sign_invar_io_lane = make_sign_invariant elts_same_io_lane
let sign_invar_io = make_sign_invariant elts_same_io
let sign_invar_2_lane = make_sign_invariant elts_same_2_lane
let sign_invar_2 = make_sign_invariant elts_same_2
let sign_invar_1 = make_sign_invariant elts_same_1

(* Sign-sensitive comparison.  *)

let cmp_sign_matters shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
  Arity2 (rtype, vtype 1, vtype 2), elt
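
(* Illustrative: the comparison keeps the operand signedness, but the result
   element is always the unsigned type of the same width, e.g. "vcge_s32"
   returns a uint32x2_t.  *)
let _ =
  assert (cmp_sign_matters (All (3, Dreg)) S32
          = (Arity2 (T_uint32x2, T_int32x2, T_int32x2), S32))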

(* Signed/unsigned invariant comparison.  *)

let cmp_sign_invar shape elt =
  let shape', elt' = cmp_sign_matters shape elt in
  let elt'' =
    match non_signed_variant elt' with
      P8 -> I8
    | x -> x
  in
  shape', elt''

(* Comparison (VTST) where only the element width matters.  *)

let cmp_bits shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0
  and bits_only = bits_of_elt elt in
  Arity2 (rtype, vtype 1, vtype 2), bits_only

let reg_shift shape elt =
  let vtype = type_for_elt shape elt
  and op2type = type_for_elt shape (signed_of_elt elt) 2 in
  Arity2 (vtype 0, vtype 1, op2type), elt

(* Genericised constant-shift type-generating function.  *)

let const_shift mkimm ?arity ?result shape elt =
  let op2type = (shapemap shape 2) elt in
  let op2width = elt_width op2type in
  let op2 = mkimm op2width
  and op1 = type_for_elt shape elt 1
  and r_elt =
    match result with
      None -> elt
    | Some restriction -> restriction elt in
  let rtype = type_for_elt shape r_elt 0 in
  match arity with
    None -> Arity2 (rtype, op1, op2), elt
  | Some mkarity -> mkarity rtype op1 op2, elt

(* Use for immediate right-shifts.  *)

let shift_right shape elt =
  const_shift (fun imm -> T_immediate (1, imm)) shape elt

let shift_right_acc shape elt =
  const_shift (fun imm -> T_immediate (1, imm))
    ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt
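
(* Illustrative: for "vshr_n_s16" the immediate must lie in 1..16 (the
   element width), encoded as T_immediate (1, 16); the left shifts below use
   T_immediate (0, width - 1) instead.  *)
let _ =
  assert (shift_right (Binary_imm Dreg) S16
          = (Arity2 (T_int16x4, T_int16x4, T_immediate (1, 16)), S16))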

(* Use for immediate right-shifts when the operation doesn't care about
   signedness.  *)

let shift_right_sign_invar =
  make_sign_invariant shift_right

(* Immediate right-shift; result is unsigned even when operand is signed.  *)

let shift_right_to_uns shape elt =
  const_shift (fun imm -> T_immediate (1, imm)) ~result:unsigned_of_elt
    shape elt

(* Immediate left-shift.  *)

let shift_left shape elt =
  const_shift (fun imm -> T_immediate (0, imm - 1)) shape elt

(* Immediate left-shift, unsigned result.  *)

let shift_left_to_uns shape elt =
  const_shift (fun imm -> T_immediate (0, imm - 1)) ~result:unsigned_of_elt
    shape elt

(* Immediate left-shift, don't care about signs.  *)

let shift_left_sign_invar =
  make_sign_invariant shift_left

(* Shift left/right and insert: only element size matters.  *)

let shift_insert shape elt =
  let arity, elt =
    const_shift (fun imm -> T_immediate (1, imm))
      ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt in
  arity, bits_of_elt elt

(* Get/set lane.  *)

let get_lane shape elt =
  let vtype = type_for_elt shape elt in
  Arity2 (vtype 0, vtype 1, vtype 2),
  (match elt with P8 -> U8 | P16 -> U16 | S32 | U32 | F32 -> B32 | x -> x)

let set_lane shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt

let set_lane_notype shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), NoElts

let create_vector shape elt =
  let vtype = type_for_elt shape U64 1
  and rtype = type_for_elt shape elt 0 in
  Arity1 (rtype, vtype), elt

let conv make_arity shape elt =
  let edest, esrc = match elt with
    Conv (edest, esrc) | Cast (edest, esrc) -> edest, esrc
  | _ -> failwith "Non-conversion element in conversion" in
  let vtype = type_for_elt shape esrc
  and rtype = type_for_elt shape edest 0 in
  make_arity rtype vtype, elt

let conv_1 = conv (fun rtype vtype -> Arity1 (rtype, vtype 1))
let conv_2 = conv (fun rtype vtype -> Arity2 (rtype, vtype 1, vtype 2))
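
(* Illustrative: a Conv element splits into destination and source types, so
   float-to-signed "vcvt" on a D-register shape maps float32x2_t to
   int32x2_t.  *)
let _ =
  assert (conv_1 (All (2, Dreg)) (Conv (S32, F32))
          = (Arity1 (T_int32x2, T_float32x2), Conv (S32, F32)))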

(* Operation has an unsigned result even if operands are signed.  *)

let dst_unsign make_arity shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
  make_arity rtype vtype, elt

let dst_unsign_1 = dst_unsign (fun rtype vtype -> Arity1 (rtype, vtype 1))

let make_bits_only func shape elt =
  let arity, elt' = func shape elt in
  arity, bits_of_elt elt'

(* Extend operation.  *)

let extend shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt

(* Table look-up operations.  Operand 2 is signed/unsigned for signed/unsigned
   integer ops respectively, or unsigned for polynomial ops.  *)

let table mkarity shape elt =
  let vtype = type_for_elt shape elt in
  let op2 = type_for_elt shape (poly_unsigned_variant elt) 2 in
  mkarity vtype op2, bits_of_elt elt

let table_2 = table (fun vtype op2 -> Arity2 (vtype 0, vtype 1, op2))
let table_io = table (fun vtype op2 -> Arity3 (vtype 0, vtype 0, vtype 1, op2))

(* Operations where only bits matter.  *)

let bits_1 = make_bits_only elts_same_1
let bits_2 = make_bits_only elts_same_2
let bits_3 = make_bits_only elts_same_3

(* Store insns.  *)
let store_1 shape elt =
  let vtype = type_for_elt shape elt in
  Arity2 (T_void, vtype 0, vtype 1), bits_of_elt elt

let store_3 shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (T_void, vtype 0, vtype 1, vtype 2), bits_of_elt elt

let make_notype func shape elt =
  let arity, _ = func shape elt in
  arity, NoElts

let notype_1 = make_notype elts_same_1
let notype_2 = make_notype elts_same_2
let notype_3 = make_notype elts_same_3

(* Bit-select operations (first operand is unsigned int).  *)

let bit_select shape elt =
  let vtype = type_for_elt shape elt
  and itype = type_for_elt shape (unsigned_of_elt elt) in
  Arity3 (vtype 0, itype 1, vtype 2, vtype 3), NoElts

(* Common lists of supported element types.  *)

let s_8_32 = [S8; S16; S32]
let u_8_32 = [U8; U16; U32]
let su_8_32 = [S8; S16; S32; U8; U16; U32]
let su_8_64 = S64 :: U64 :: su_8_32
let su_16_64 = [S16; S32; S64; U16; U32; U64]
let pf_su_8_16 = [P8; P16; S8; S16; U8; U16]
let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32
let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64
let suf_32 = [S32; U32; F32]
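
(* Illustrative: the shape of one entry in the OPS table below, modelled on
   its first "vadd" line.  Each entry is (opcode, features, shape, intrinsic
   base name, prototype-building function, supported element types).  The
   binding is unused by the generator; it only documents the tuple type.  *)
let _example_ops_entry : opcode * features list * shape_form * string
                         * (shape_form -> elts -> arity * elts) * elts list =
  Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_32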

let ops =
  [
    (* Addition.  *)
    Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_32;
    Vadd, [No_op], All (3, Dreg), "vadd", sign_invar_2, [S64; U64];
    Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64;
    Vadd, [], Long, "vaddl", elts_same_2, su_8_32;
    Vadd, [], Wide, "vaddw", elts_same_2, su_8_32;
    Vadd, [Halving], All (3, Dreg), "vhadd", elts_same_2, su_8_32;
    Vadd, [Halving], All (3, Qreg), "vhaddQ", elts_same_2, su_8_32;
    Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
      All (3, Dreg), "vRhadd", elts_same_2, su_8_32;
    Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
      All (3, Qreg), "vRhaddQ", elts_same_2, su_8_32;
    Vadd, [Saturating], All (3, Dreg), "vqadd", elts_same_2, su_8_64;
    Vadd, [Saturating], All (3, Qreg), "vqaddQ", elts_same_2, su_8_64;
    Vadd, [High_half], Narrow, "vaddhn", sign_invar_2, su_16_64;
    Vadd, [Instruction_name ["vraddhn"]; Rounding; High_half],
      Narrow, "vRaddhn", sign_invar_2, su_16_64;

    (* Multiplication.  *)
    Vmul, [], All (3, Dreg), "vmul", sign_invar_2, P8 :: F32 :: su_8_32;
    Vmul, [], All (3, Qreg), "vmulQ", sign_invar_2, P8 :: F32 :: su_8_32;
    Vmul, [Saturating; Doubling; High_half], All (3, Dreg), "vqdmulh",
      elts_same_2, [S16; S32];
    Vmul, [Saturating; Doubling; High_half], All (3, Qreg), "vqdmulhQ",
      elts_same_2, [S16; S32];
    Vmul,
      [Saturating; Rounding; Doubling; High_half;
       Instruction_name ["vqrdmulh"]],
      All (3, Dreg), "vqRdmulh",
      elts_same_2, [S16; S32];
    Vmul,
      [Saturating; Rounding; Doubling; High_half;
       Instruction_name ["vqrdmulh"]],
      All (3, Qreg), "vqRdmulhQ",
      elts_same_2, [S16; S32];
    Vmul, [], Long, "vmull", elts_same_2, P8 :: su_8_32;
    Vmul, [Saturating; Doubling], Long, "vqdmull", elts_same_2, [S16; S32];

    (* Multiply-accumulate.  *)
    Vmla, [], All (3, Dreg), "vmla", sign_invar_io, F32 :: su_8_32;
    Vmla, [], All (3, Qreg), "vmlaQ", sign_invar_io, F32 :: su_8_32;
    Vmla, [], Long, "vmlal", elts_same_io, su_8_32;
    Vmla, [Saturating; Doubling], Long, "vqdmlal", elts_same_io, [S16; S32];

    (* Multiply-subtract.  *)
    Vmls, [], All (3, Dreg), "vmls", sign_invar_io, F32 :: su_8_32;
    Vmls, [], All (3, Qreg), "vmlsQ", sign_invar_io, F32 :: su_8_32;
    Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
    Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];

    (* Fused-multiply-accumulate.  *)
    Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32];
    Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32];
    Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32];
    Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32];

    (* Round to integral.  *)
    Vrintn, [Builtin_name "vrintn"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrndn", elts_same_1, [F32];
    Vrintn, [Builtin_name "vrintn"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndqn", elts_same_1, [F32];
    Vrinta, [Builtin_name "vrinta"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrnda", elts_same_1, [F32];
    Vrinta, [Builtin_name "vrinta"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndqa", elts_same_1, [F32];
    Vrintp, [Builtin_name "vrintp"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrndp", elts_same_1, [F32];
    Vrintp, [Builtin_name "vrintp"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndqp", elts_same_1, [F32];
    Vrintm, [Builtin_name "vrintm"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrndm", elts_same_1, [F32];
    Vrintm, [Builtin_name "vrintm"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndqm", elts_same_1, [F32];
    Vrintz, [Builtin_name "vrintz"; Requires_arch 8],
      Use_operands [| Dreg; Dreg |], "vrnd", elts_same_1, [F32];
    Vrintz, [Builtin_name "vrintz"; Requires_arch 8],
      Use_operands [| Qreg; Qreg |], "vrndq", elts_same_1, [F32];

    (* Subtraction.  *)
    Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32;
    Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2, [S64; U64];
    Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64;
    Vsub, [], Long, "vsubl", elts_same_2, su_8_32;
    Vsub, [], Wide, "vsubw", elts_same_2, su_8_32;
    Vsub, [Halving], All (3, Dreg), "vhsub", elts_same_2, su_8_32;
    Vsub, [Halving], All (3, Qreg), "vhsubQ", elts_same_2, su_8_32;
    Vsub, [Saturating], All (3, Dreg), "vqsub", elts_same_2, su_8_64;
    Vsub, [Saturating], All (3, Qreg), "vqsubQ", elts_same_2, su_8_64;
    Vsub, [High_half], Narrow, "vsubhn", sign_invar_2, su_16_64;
    Vsub, [Instruction_name ["vrsubhn"]; Rounding; High_half],
      Narrow, "vRsubhn", sign_invar_2, su_16_64;

    (* Comparison, equal.  *)
    Vceq, [], All (3, Dreg), "vceq", cmp_sign_invar, P8 :: F32 :: su_8_32;
    Vceq, [], All (3, Qreg), "vceqQ", cmp_sign_invar, P8 :: F32 :: su_8_32;

    (* Comparison, greater-than or equal.  *)
    Vcge, [], All (3, Dreg), "vcge", cmp_sign_matters, F32 :: s_8_32;
    Vcge, [Instruction_name ["vcge"]; Builtin_name "vcgeu"],
      All (3, Dreg), "vcge", cmp_sign_matters,
      u_8_32;
    Vcge, [], All (3, Qreg), "vcgeQ", cmp_sign_matters, F32 :: s_8_32;
    Vcge, [Instruction_name ["vcge"]; Builtin_name "vcgeu"],
      All (3, Qreg), "vcgeQ", cmp_sign_matters,
      u_8_32;

    (* Comparison, less-than or equal.  *)
    Vcle, [Flipped "vcge"], All (3, Dreg), "vcle", cmp_sign_matters,
      F32 :: s_8_32;
    Vcle, [Instruction_name ["vcge"]; Flipped "vcgeu"],
      All (3, Dreg), "vcle", cmp_sign_matters,
      u_8_32;
    Vcle, [Instruction_name ["vcge"]; Flipped "vcgeQ"],
      All (3, Qreg), "vcleQ", cmp_sign_matters,
      F32 :: s_8_32;
    Vcle, [Instruction_name ["vcge"]; Flipped "vcgeuQ"],
      All (3, Qreg), "vcleQ", cmp_sign_matters,
      u_8_32;

    (* Comparison, greater-than.  *)
    Vcgt, [], All (3, Dreg), "vcgt", cmp_sign_matters, F32 :: s_8_32;
    Vcgt, [Instruction_name ["vcgt"]; Builtin_name "vcgtu"],
      All (3, Dreg), "vcgt", cmp_sign_matters,
      u_8_32;
    Vcgt, [], All (3, Qreg), "vcgtQ", cmp_sign_matters, F32 :: s_8_32;
    Vcgt, [Instruction_name ["vcgt"]; Builtin_name "vcgtu"],
      All (3, Qreg), "vcgtQ", cmp_sign_matters,
      u_8_32;

    (* Comparison, less-than.  *)
    Vclt, [Flipped "vcgt"], All (3, Dreg), "vclt", cmp_sign_matters,
      F32 :: s_8_32;
    Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtu"],
      All (3, Dreg), "vclt", cmp_sign_matters,
      u_8_32;
    Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtQ"],
      All (3, Qreg), "vcltQ", cmp_sign_matters,
      F32 :: s_8_32;
    Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtuQ"],
      All (3, Qreg), "vcltQ", cmp_sign_matters,
      u_8_32;

    (* Compare absolute greater-than or equal.  *)
    Vcage, [Instruction_name ["vacge"]],
      All (3, Dreg), "vcage", cmp_sign_matters, [F32];
    Vcage, [Instruction_name ["vacge"]],
      All (3, Qreg), "vcageQ", cmp_sign_matters, [F32];

    (* Compare absolute less-than or equal.  *)
    Vcale, [Instruction_name ["vacge"]; Flipped "vcage"],
      All (3, Dreg), "vcale", cmp_sign_matters, [F32];
    Vcale, [Instruction_name ["vacge"]; Flipped "vcageQ"],
      All (3, Qreg), "vcaleQ", cmp_sign_matters, [F32];

    (* Compare absolute greater-than.  *)
    Vcagt, [Instruction_name ["vacgt"]],
      All (3, Dreg), "vcagt", cmp_sign_matters, [F32];
    Vcagt, [Instruction_name ["vacgt"]],
      All (3, Qreg), "vcagtQ", cmp_sign_matters, [F32];

    (* Compare absolute less-than.  *)
    Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagt"],
      All (3, Dreg), "vcalt", cmp_sign_matters, [F32];
    Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagtQ"],
      All (3, Qreg), "vcaltQ", cmp_sign_matters, [F32];

    (* Test bits.  *)
    Vtst, [], All (3, Dreg), "vtst", cmp_bits, P8 :: su_8_32;
    Vtst, [], All (3, Qreg), "vtstQ", cmp_bits, P8 :: su_8_32;

    (* Absolute difference.  *)
    Vabd, [], All (3, Dreg), "vabd", elts_same_2, F32 :: su_8_32;
    Vabd, [], All (3, Qreg), "vabdQ", elts_same_2, F32 :: su_8_32;
    Vabd, [], Long, "vabdl", elts_same_2, su_8_32;

    (* Absolute difference and accumulate.  *)
    Vaba, [], All (3, Dreg), "vaba", elts_same_io, su_8_32;
    Vaba, [], All (3, Qreg), "vabaQ", elts_same_io, su_8_32;
    Vaba, [], Long, "vabal", elts_same_io, su_8_32;

    (* Max.  *)
    Vmax, [], All (3, Dreg), "vmax", elts_same_2, F32 :: su_8_32;
    Vmax, [], All (3, Qreg), "vmaxQ", elts_same_2, F32 :: su_8_32;

    (* Min.  *)
    Vmin, [], All (3, Dreg), "vmin", elts_same_2, F32 :: su_8_32;
    Vmin, [], All (3, Qreg), "vminQ", elts_same_2, F32 :: su_8_32;

    (* Pairwise add.  *)
    Vpadd, [], All (3, Dreg), "vpadd", sign_invar_2, F32 :: su_8_32;
    Vpadd, [], Long_noreg Dreg, "vpaddl", elts_same_1, su_8_32;
    Vpadd, [], Long_noreg Qreg, "vpaddlQ", elts_same_1, su_8_32;

    (* Pairwise add, widen and accumulate.  *)
    Vpada, [], Wide_noreg Dreg, "vpadal", elts_same_2, su_8_32;
    Vpada, [], Wide_noreg Qreg, "vpadalQ", elts_same_2, su_8_32;

    (* Folding maximum, minimum.  *)
    Vpmax, [], All (3, Dreg), "vpmax", elts_same_2, F32 :: su_8_32;
    Vpmin, [], All (3, Dreg), "vpmin", elts_same_2, F32 :: su_8_32;

    (* Reciprocal step.  *)
    Vrecps, [], All (3, Dreg), "vrecps", elts_same_2, [F32];
    Vrecps, [], All (3, Qreg), "vrecpsQ", elts_same_2, [F32];
    Vrsqrts, [], All (3, Dreg), "vrsqrts", elts_same_2, [F32];
    Vrsqrts, [], All (3, Qreg), "vrsqrtsQ", elts_same_2, [F32];

    (* Vector shift left.  *)
    Vshl, [], All (3, Dreg), "vshl", reg_shift, su_8_64;
    Vshl, [], All (3, Qreg), "vshlQ", reg_shift, su_8_64;
    Vshl, [Instruction_name ["vrshl"]; Rounding],
      All (3, Dreg), "vRshl", reg_shift, su_8_64;
    Vshl, [Instruction_name ["vrshl"]; Rounding],
      All (3, Qreg), "vRshlQ", reg_shift, su_8_64;
    Vshl, [Saturating], All (3, Dreg), "vqshl", reg_shift, su_8_64;
    Vshl, [Saturating], All (3, Qreg), "vqshlQ", reg_shift, su_8_64;
    Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
      All (3, Dreg), "vqRshl", reg_shift, su_8_64;
    Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
      All (3, Qreg), "vqRshlQ", reg_shift, su_8_64;

    (* Vector shift right by constant.  *)
    Vshr_n, [], Binary_imm Dreg, "vshr_n", shift_right, su_8_64;
    Vshr_n, [], Binary_imm Qreg, "vshrQ_n", shift_right, su_8_64;
    Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Dreg,
      "vRshr_n", shift_right, su_8_64;
    Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Qreg,
      "vRshrQ_n", shift_right, su_8_64;
    Vshr_n, [], Narrow_imm, "vshrn_n", shift_right_sign_invar, su_16_64;
    Vshr_n, [Instruction_name ["vrshrn"]; Rounding], Narrow_imm, "vRshrn_n",
      shift_right_sign_invar, su_16_64;
    Vshr_n, [Saturating], Narrow_imm, "vqshrn_n", shift_right, su_16_64;
    Vshr_n, [Instruction_name ["vqrshrn"]; Saturating; Rounding], Narrow_imm,
      "vqRshrn_n", shift_right, su_16_64;
    Vshr_n, [Saturating; Dst_unsign], Narrow_imm, "vqshrun_n",
      shift_right_to_uns, [S16; S32; S64];
    Vshr_n, [Instruction_name ["vqrshrun"]; Saturating; Dst_unsign; Rounding],
      Narrow_imm, "vqRshrun_n", shift_right_to_uns, [S16; S32; S64];

    (* Vector shift left by constant.  *)
    Vshl_n, [], Binary_imm Dreg, "vshl_n", shift_left_sign_invar, su_8_64;
    Vshl_n, [], Binary_imm Qreg, "vshlQ_n", shift_left_sign_invar, su_8_64;
    Vshl_n, [Saturating], Binary_imm Dreg, "vqshl_n", shift_left, su_8_64;
    Vshl_n, [Saturating], Binary_imm Qreg, "vqshlQ_n", shift_left, su_8_64;
    Vshl_n, [Saturating; Dst_unsign], Binary_imm Dreg, "vqshlu_n",
      shift_left_to_uns, [S8; S16; S32; S64];
    Vshl_n, [Saturating; Dst_unsign], Binary_imm Qreg, "vqshluQ_n",
      shift_left_to_uns, [S8; S16; S32; S64];
    Vshl_n, [], Long_imm, "vshll_n", shift_left, su_8_32;

    (* Vector shift right by constant and accumulate.  *)
    Vsra_n, [], Binary_imm Dreg, "vsra_n", shift_right_acc, su_8_64;
    Vsra_n, [], Binary_imm Qreg, "vsraQ_n", shift_right_acc, su_8_64;
    Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Dreg,
      "vRsra_n", shift_right_acc, su_8_64;
    Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Qreg,
      "vRsraQ_n", shift_right_acc, su_8_64;

    (* Vector shift right and insert.  *)
    Vsri, [Requires_feature "CRYPTO"], Use_operands [| Dreg; Dreg; Immed |],
      "vsri_n", shift_insert, [P64];
    Vsri, [], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert,
      P8 :: P16 :: su_8_64;
    Vsri, [Requires_feature "CRYPTO"], Use_operands [| Qreg; Qreg; Immed |],
      "vsriQ_n", shift_insert, [P64];
    Vsri, [], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert,
      P8 :: P16 :: su_8_64;

    (* Vector shift left and insert.  *)
    Vsli, [Requires_feature "CRYPTO"], Use_operands [| Dreg; Dreg; Immed |],
      "vsli_n", shift_insert, [P64];
    Vsli, [], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert,
      P8 :: P16 :: su_8_64;
    Vsli, [Requires_feature "CRYPTO"], Use_operands [| Qreg; Qreg; Immed |],
      "vsliQ_n", shift_insert, [P64];
    Vsli, [], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert,
      P8 :: P16 :: su_8_64;

    (* Absolute value.  *)
    Vabs, [], All (2, Dreg), "vabs", elts_same_1, [S8; S16; S32; F32];
    Vabs, [], All (2, Qreg), "vabsQ", elts_same_1, [S8; S16; S32; F32];
    Vabs, [Saturating], All (2, Dreg), "vqabs", elts_same_1, [S8; S16; S32];
    Vabs, [Saturating], All (2, Qreg), "vqabsQ", elts_same_1, [S8; S16; S32];

    (* Negate.  *)
    Vneg, [], All (2, Dreg), "vneg", elts_same_1, [S8; S16; S32; F32];
    Vneg, [], All (2, Qreg), "vnegQ", elts_same_1, [S8; S16; S32; F32];
    Vneg, [Saturating], All (2, Dreg), "vqneg", elts_same_1, [S8; S16; S32];
    Vneg, [Saturating], All (2, Qreg), "vqnegQ", elts_same_1, [S8; S16; S32];

    (* Bitwise not.  *)
    Vmvn, [], All (2, Dreg), "vmvn", notype_1, P8 :: su_8_32;
    Vmvn, [], All (2, Qreg), "vmvnQ", notype_1, P8 :: su_8_32;

    (* Count leading sign bits.  *)
    Vcls, [], All (2, Dreg), "vcls", elts_same_1, [S8; S16; S32];
    Vcls, [], All (2, Qreg), "vclsQ", elts_same_1, [S8; S16; S32];

    (* Count leading zeros.  *)
    Vclz, [], All (2, Dreg), "vclz", sign_invar_1, su_8_32;
    Vclz, [], All (2, Qreg), "vclzQ", sign_invar_1, su_8_32;

    (* Count number of set bits.  *)
    Vcnt, [], All (2, Dreg), "vcnt", bits_1, [P8; S8; U8];
    Vcnt, [], All (2, Qreg), "vcntQ", bits_1, [P8; S8; U8];

    (* Reciprocal estimate.  *)
    Vrecpe, [], All (2, Dreg), "vrecpe", elts_same_1, [U32; F32];
    Vrecpe, [], All (2, Qreg), "vrecpeQ", elts_same_1, [U32; F32];

    (* Reciprocal square-root estimate.  *)
    Vrsqrte, [], All (2, Dreg), "vrsqrte", elts_same_1, [U32; F32];
    Vrsqrte, [], All (2, Qreg), "vrsqrteQ", elts_same_1, [U32; F32];

    (* Get lanes from a vector.  *)
    Vget_lane,
      [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
       Instruction_name ["vmov"]],
      Use_operands [| Corereg; Dreg; Immed |],
      "vget_lane", get_lane, pf_su_8_32;
    Vget_lane,
      [No_op;
       InfoWord;
       Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
       Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
      Use_operands [| Corereg; Dreg; Immed |],
      "vget_lane", notype_2, [S64; U64];
    Vget_lane,
      [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
       Instruction_name ["vmov"]],
      Use_operands [| Corereg; Qreg; Immed |],
      "vgetQ_lane", get_lane, pf_su_8_32;
    Vget_lane,
      [InfoWord;
       Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
       Instruction_name ["vmov"; "fmrrd"]; Const_valuator (fun _ -> 0);
       Fixed_core_reg],
      Use_operands [| Corereg; Qreg; Immed |],
      "vgetQ_lane", notype_2, [S64; U64];

    (* Set lanes in a vector.  *)
    Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
                Instruction_name ["vmov"]],
      Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
      set_lane, pf_su_8_32;
    Vset_lane, [No_op;
                Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
                Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
      Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
      set_lane_notype, [S64; U64];
    Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
                Instruction_name ["vmov"]],
      Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
      set_lane, pf_su_8_32;
    Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
                Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
      Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
      set_lane_notype, [S64; U64];

    (* Create vector from literal bit pattern.  *)
    Vcreate,
      [Requires_feature "CRYPTO"; No_op], (* Not really, but it can yield
         various things that are too hard for the test generator at this
         time.  *)
      Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
      [P64];
    Vcreate,
      [No_op], (* Not really, but it can yield various things that are too
         hard for the test generator at this time.  *)
      Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
      pf_su_8_64;

    (* Set all lanes to the same value.  *)
    Vdup_n,
      [Disassembles_as [Use_operands [| Dreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Dreg; Corereg |], "vdup_n", bits_1,
      pf_su_8_32;
    Vdup_n,
      [No_op; Requires_feature "CRYPTO";
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
      [P64];
    Vdup_n,
      [No_op;
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
      [S64; U64];
    Vdup_n,
      [No_op; Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| Qreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
      [P64];
    Vdup_n,
      [Disassembles_as [Use_operands [| Qreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
      pf_su_8_32;
    Vdup_n,
      [No_op;
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
                        Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Qreg; Corereg |], "vdupQ_n", notype_1,
      [S64; U64];

    (* These are just aliases for the above.  *)
    Vmov_n,
      [Builtin_name "vdup_n";
       Disassembles_as [Use_operands [| Dreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Dreg; Corereg |],
      "vmov_n", bits_1, pf_su_8_32;
    Vmov_n,
      [No_op;
       Builtin_name "vdup_n";
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Dreg; Corereg |],
      "vmov_n", notype_1, [S64; U64];
    Vmov_n,
      [Builtin_name "vdupQ_n";
       Disassembles_as [Use_operands [| Qreg;
                                        Alternatives [ Corereg;
                                                       Element_of_dreg ] |]]],
      Use_operands [| Qreg; Corereg |],
      "vmovQ_n", bits_1, pf_su_8_32;
    Vmov_n,
      [No_op;
       Builtin_name "vdupQ_n";
       Instruction_name ["vmov"];
       Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
                        Use_operands [| Dreg; Corereg; Corereg |]]],
      Use_operands [| Qreg; Corereg |],
      "vmovQ_n", notype_1, [S64; U64];

    (* Duplicate, lane version.  We can't use Use_operands here because the
       rightmost register (always Dreg) would be picked up by find_key_operand,
       when we want the leftmost register to be used in this case (otherwise
       the modes are indistinguishable in neon.md, etc.).  *)
    Vdup_lane,
      [Disassembles_as [Use_operands [| Dreg; Element_of_dreg |]]],
      Unary_scalar Dreg, "vdup_lane", bits_2, pf_su_8_32;
    Vdup_lane,
      [No_op; Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
      Unary_scalar Dreg, "vdup_lane", bits_2, [P64];
    Vdup_lane,
      [No_op; Const_valuator (fun _ -> 0)],
      Unary_scalar Dreg, "vdup_lane", bits_2, [S64; U64];
    Vdup_lane,
      [Disassembles_as [Use_operands [| Qreg; Element_of_dreg |]]],
      Unary_scalar Qreg, "vdupQ_lane", bits_2, pf_su_8_32;
    Vdup_lane,
      [No_op; Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
      Unary_scalar Qreg, "vdupQ_lane", bits_2, [P64];
    Vdup_lane,
      [No_op; Const_valuator (fun _ -> 0)],
      Unary_scalar Qreg, "vdupQ_lane", bits_2, [S64; U64];

    (* Combining vectors.  *)
    Vcombine, [Requires_feature "CRYPTO"; No_op],
      Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
      [P64];
    Vcombine, [No_op],
      Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
      pf_su_8_64;

    (* Splitting vectors.  *)
    Vget_high, [Requires_feature "CRYPTO"; No_op],
      Use_operands [| Dreg; Qreg |], "vget_high",
      notype_1, [P64];
    Vget_high, [No_op],
      Use_operands [| Dreg; Qreg |], "vget_high",
      notype_1, pf_su_8_64;
    Vget_low, [Instruction_name ["vmov"];
               Disassembles_as [Use_operands [| Dreg; Dreg |]];
               Fixed_vector_reg],
      Use_operands [| Dreg; Qreg |], "vget_low",
      notype_1, pf_su_8_32;
    Vget_low, [Requires_feature "CRYPTO"; No_op],
      Use_operands [| Dreg; Qreg |], "vget_low",
      notype_1, [P64];
    Vget_low, [No_op],
      Use_operands [| Dreg; Qreg |], "vget_low",
      notype_1, [S64; U64];

    (* Conversions.  *)
    Vcvt, [InfoWord], All (2, Dreg), "vcvt", conv_1,
      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
    Vcvt, [InfoWord], All (2, Qreg), "vcvtQ", conv_1,
      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
    Vcvt, [Builtin_name "vcvt"; Requires_FP_bit 1],
      Use_operands [| Dreg; Qreg |], "vcvt", conv_1, [Conv (F16, F32)];
    Vcvt, [Builtin_name "vcvt"; Requires_FP_bit 1],
      Use_operands [| Qreg; Dreg |], "vcvt", conv_1, [Conv (F32, F16)];
    Vcvt_n, [InfoWord], Use_operands [| Dreg; Dreg; Immed |], "vcvt_n", conv_2,
      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
    Vcvt_n, [InfoWord], Use_operands [| Qreg; Qreg; Immed |], "vcvtQ_n", conv_2,
      [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];

    (* Move, narrowing.  *)
    Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]],
      Narrow, "vmovn", sign_invar_1, su_16_64;
    Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating],
      Narrow, "vqmovn", elts_same_1, su_16_64;
    Vmovn,
      [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating; Dst_unsign],
      Narrow, "vqmovun", dst_unsign_1,
      [S16; S32; S64];

    (* Move, long.  *)
    Vmovl, [Disassembles_as [Use_operands [| Qreg; Dreg |]]],
      Long, "vmovl", elts_same_1, su_8_32;

    (* Table lookup.  *)
    Vtbl 1,
      [Instruction_name ["vtbl"];
       Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
      Use_operands [| Dreg; Dreg; Dreg |], "vtbl1", table_2, [U8; S8; P8];
    Vtbl 2, [Instruction_name ["vtbl"]],
      Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbl2", table_2,
      [U8; S8; P8];
    Vtbl 3, [Instruction_name ["vtbl"]],
      Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbl3", table_2,
      [U8; S8; P8];
    Vtbl 4, [Instruction_name ["vtbl"]],
      Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbl4", table_2,
      [U8; S8; P8];

    (* Extended table lookup.  *)
    Vtbx 1,
      [Instruction_name ["vtbx"];
       Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
      Use_operands [| Dreg; Dreg; Dreg |], "vtbx1", table_io, [U8; S8; P8];
    Vtbx 2, [Instruction_name ["vtbx"]],
      Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbx2", table_io,
      [U8; S8; P8];
    Vtbx 3, [Instruction_name ["vtbx"]],
      Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbx3", table_io,
      [U8; S8; P8];
    Vtbx 4, [Instruction_name ["vtbx"]],
      Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbx4", table_io,
      [U8; S8; P8];

    (* Multiply, lane.  (note: these were undocumented at the time of
       writing).  *)
    Vmul_lane, [], By_scalar Dreg, "vmul_lane", sign_invar_2_lane,
      [S16; S32; U16; U32; F32];
    Vmul_lane, [], By_scalar Qreg, "vmulQ_lane", sign_invar_2_lane,
      [S16; S32; U16; U32; F32];

    (* Multiply-accumulate, lane.  *)
    Vmla_lane, [], By_scalar Dreg, "vmla_lane", sign_invar_io_lane,
      [S16; S32; U16; U32; F32];
    Vmla_lane, [], By_scalar Qreg, "vmlaQ_lane", sign_invar_io_lane,
      [S16; S32; U16; U32; F32];
    Vmla_lane, [], Wide_lane, "vmlal_lane", elts_same_io_lane,
      [S16; S32; U16; U32];
    Vmla_lane, [Saturating; Doubling], Wide_lane, "vqdmlal_lane",
      elts_same_io_lane, [S16; S32];

    (* Multiply-subtract, lane.  *)
    Vmls_lane, [], By_scalar Dreg, "vmls_lane", sign_invar_io_lane,
      [S16; S32; U16; U32; F32];
    Vmls_lane, [], By_scalar Qreg, "vmlsQ_lane", sign_invar_io_lane,
      [S16; S32; U16; U32; F32];
    Vmls_lane, [], Wide_lane, "vmlsl_lane", elts_same_io_lane,
      [S16; S32; U16; U32];
    Vmls_lane, [Saturating; Doubling], Wide_lane, "vqdmlsl_lane",
      elts_same_io_lane, [S16; S32];

    (* Long multiply, lane.  *)
    Vmull_lane, [],
      Wide_lane, "vmull_lane", elts_same_2_lane, [S16; S32; U16; U32];

    (* Saturating doubling long multiply, lane.  *)
    Vqdmull_lane, [Saturating; Doubling],
      Wide_lane, "vqdmull_lane", elts_same_2_lane, [S16; S32];

    (* Saturating doubling long multiply high, lane.  *)
    Vqdmulh_lane, [Saturating; Halving],
      By_scalar Qreg, "vqdmulhQ_lane", elts_same_2_lane, [S16; S32];
    Vqdmulh_lane, [Saturating; Halving],
      By_scalar Dreg, "vqdmulh_lane", elts_same_2_lane, [S16; S32];
    Vqdmulh_lane, [Saturating; Halving; Rounding;
                   Instruction_name ["vqrdmulh"]],
      By_scalar Qreg, "vqRdmulhQ_lane", elts_same_2_lane, [S16; S32];
    Vqdmulh_lane, [Saturating; Halving; Rounding;
                   Instruction_name ["vqrdmulh"]],
      By_scalar Dreg, "vqRdmulh_lane", elts_same_2_lane, [S16; S32];

    (* Vector multiply by scalar.  *)
    Vmul_n, [InfoWord;
             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |], "vmul_n",
      sign_invar_2, [S16; S32; U16; U32; F32];
    Vmul_n, [InfoWord;
             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |], "vmulQ_n",
      sign_invar_2, [S16; S32; U16; U32; F32];

    (* Vector long multiply by scalar.  *)
    Vmull_n, [Instruction_name ["vmull"];
              Disassembles_as [Use_operands [| Qreg; Dreg; Element_of_dreg |]]],
      Wide_scalar, "vmull_n",
      elts_same_2, [S16; S32; U16; U32];

    (* Vector saturating doubling long multiply by scalar.  *)
    Vqdmull_n, [Saturating; Doubling;
                Disassembles_as [Use_operands [| Qreg; Dreg;
                                                 Element_of_dreg |]]],
      Wide_scalar, "vqdmull_n",
      elts_same_2, [S16; S32];

    (* Vector saturating doubling long multiply high by scalar.  *)
    Vqdmulh_n,
      [Saturating; Halving; InfoWord;
       Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |],
      "vqdmulhQ_n", elts_same_2, [S16; S32];
    Vqdmulh_n,
      [Saturating; Halving; InfoWord;
       Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |],
      "vqdmulh_n", elts_same_2, [S16; S32];
    Vqdmulh_n,
      [Saturating; Halving; Rounding; InfoWord;
       Instruction_name ["vqrdmulh"];
       Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |],
      "vqRdmulhQ_n", elts_same_2, [S16; S32];
    Vqdmulh_n,
      [Saturating; Halving; Rounding; InfoWord;
       Instruction_name ["vqrdmulh"];
       Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |],
      "vqRdmulh_n", elts_same_2, [S16; S32];

    (* Vector multiply-accumulate by scalar.  *)
    Vmla_n, [InfoWord;
             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |], "vmla_n",
      sign_invar_io, [S16; S32; U16; U32; F32];
    Vmla_n, [InfoWord;
             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |], "vmlaQ_n",
      sign_invar_io, [S16; S32; U16; U32; F32];
    Vmla_n, [], Wide_scalar, "vmlal_n", elts_same_io, [S16; S32; U16; U32];
    Vmla_n, [Saturating; Doubling], Wide_scalar, "vqdmlal_n", elts_same_io,
      [S16; S32];

    (* Vector multiply-subtract by scalar.  *)
    Vmls_n, [InfoWord;
             Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
      Use_operands [| Dreg; Dreg; Corereg |], "vmls_n",
      sign_invar_io, [S16; S32; U16; U32; F32];
    Vmls_n, [InfoWord;
             Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
      Use_operands [| Qreg; Qreg; Corereg |], "vmlsQ_n",
      sign_invar_io, [S16; S32; U16; U32; F32];
    Vmls_n, [], Wide_scalar, "vmlsl_n", elts_same_io, [S16; S32; U16; U32];
    Vmls_n, [Saturating; Doubling], Wide_scalar, "vqdmlsl_n", elts_same_io,
      [S16; S32];
1468 (* Vector extract. *)
1469 Vext, [Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
1470 Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
1471 [P64];
1472 Vext, [Const_valuator (fun _ -> 0)],
1473 Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
1474 pf_su_8_64;
1475 Vext, [Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
1476 Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
1477 [P64];
1478 Vext, [Const_valuator (fun _ -> 0)],
1479 Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
1480 pf_su_8_64;
1482 (* Reverse elements. *)
1483 Vrev64, [Use_shuffle (rev_elems 64)], All (2, Dreg), "vrev64", bits_1,
1484 P8 :: P16 :: F32 :: su_8_32;
1485 Vrev64, [Use_shuffle (rev_elems 64)], All (2, Qreg), "vrev64Q", bits_1,
1486 P8 :: P16 :: F32 :: su_8_32;
1487 Vrev32, [Use_shuffle (rev_elems 32)], All (2, Dreg), "vrev32", bits_1,
1488 [P8; P16; S8; U8; S16; U16];
1489 Vrev32, [Use_shuffle (rev_elems 32)], All (2, Qreg), "vrev32Q", bits_1,
1490 [P8; P16; S8; U8; S16; U16];
1491 Vrev16, [Use_shuffle (rev_elems 16)], All (2, Dreg), "vrev16", bits_1,
1492 [P8; S8; U8];
1493 Vrev16, [Use_shuffle (rev_elems 16)], All (2, Qreg), "vrev16Q", bits_1,
1494 [P8; S8; U8];
1496 (* Bit selection. *)
1497 Vbsl,
1498 [Requires_feature "CRYPTO"; Instruction_name ["vbsl"; "vbit"; "vbif"];
1499 Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
1500 Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
1501 [P64];
1502 Vbsl,
1503 [Instruction_name ["vbsl"; "vbit"; "vbif"];
1504 Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
1505 Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
1506 pf_su_8_64;
1507 Vbsl,
1508 [Requires_feature "CRYPTO"; Instruction_name ["vbsl"; "vbit"; "vbif"];
1509 Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
1510 Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
1511 [P64];
1512 Vbsl,
1513 [Instruction_name ["vbsl"; "vbit"; "vbif"];
1514 Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
1515 Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
1516 pf_su_8_64;

    Vtrn, [Use_shuffle trn_elems], Pair_result Dreg, "vtrn", bits_2, pf_su_8_16;
    Vtrn, [Use_shuffle trn_elems; Instruction_name ["vuzp"]], Pair_result Dreg,
      "vtrn", bits_2, suf_32;
    Vtrn, [Use_shuffle trn_elems], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
    (* Zip elements. *)
    Vzip, [Use_shuffle zip_elems], Pair_result Dreg, "vzip", bits_2, pf_su_8_16;
    Vzip, [Use_shuffle zip_elems; Instruction_name ["vuzp"]], Pair_result Dreg,
      "vzip", bits_2, suf_32;
    Vzip, [Use_shuffle zip_elems], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;

    (* Unzip elements. *)
    Vuzp, [Use_shuffle uzip_elems], Pair_result Dreg, "vuzp", bits_2,
      pf_su_8_32;
    Vuzp, [Use_shuffle uzip_elems], Pair_result Qreg, "vuzpQ", bits_2,
      pf_su_8_32;

    (* Element/structure loads.  VLD1 variants. *)
    Vldx 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
      [P64];
    Vldx 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
      pf_su_8_64;
    Vldx 1, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
      [P64];
    Vldx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
      pf_su_8_64;
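
    (* Note (illustration only, not part of the original table): each entry
       expands to one intrinsic per element type in its final list, so the
       plain Dreg "vld1" entry above yields vld1_s8, vld1_u8, vld1_p8,
       vld1_f32, ... while the CRYPTO-guarded entry contributes vld1_p64. *)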

    Vldx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
      "vld1_lane", bits_3, pf_su_8_32;
    Vldx_lane 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]];
       Const_valuator (fun _ -> 0)],
      Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
      "vld1_lane", bits_3, [P64];
    Vldx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]];
       Const_valuator (fun _ -> 0)],
      Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
      "vld1_lane", bits_3, [S64; U64];
    Vldx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
      "vld1Q_lane", bits_3, pf_su_8_32;
    Vldx_lane 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
      "vld1Q_lane", bits_3, [P64];
    Vldx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
      "vld1Q_lane", bits_3, [S64; U64];

    Vldx_dup 1,
      [Disassembles_as [Use_operands [| VecArray (1, All_elements_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
      bits_1, pf_su_8_32;
    Vldx_dup 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
      bits_1, [P64];
    Vldx_dup 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
      bits_1, [S64; U64];
    Vldx_dup 1,
      [Disassembles_as [Use_operands [| VecArray (2, All_elements_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
      bits_1, pf_su_8_32;
    (* Treated identically to vld1_dup above as we now
       do a single load followed by a duplicate. *)
    Vldx_dup 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
      bits_1, [P64];
    Vldx_dup 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
      bits_1, [S64; U64];

    (* VST1 variants. *)
    Vstx 1, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Dreg |], "vst1",
      store_1, [P64];
    Vstx 1, [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Dreg |], "vst1",
      store_1, pf_su_8_64;
    Vstx 1, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
      store_1, [P64];
    Vstx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
      store_1, pf_su_8_64;

    Vstx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Dreg; Immed |],
      "vst1_lane", store_3, pf_su_8_32;
    Vstx_lane 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]];
       Const_valuator (fun _ -> 0)],
      Use_operands [| PtrTo Corereg; Dreg; Immed |],
      "vst1_lane", store_3, [P64];
    Vstx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]];
       Const_valuator (fun _ -> 0)],
      Use_operands [| PtrTo Corereg; Dreg; Immed |],
      "vst1_lane", store_3, [U64; S64];
    Vstx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg; Immed |],
      "vst1Q_lane", store_3, pf_su_8_32;
    Vstx_lane 1,
      [Requires_feature "CRYPTO";
       Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg; Immed |],
      "vst1Q_lane", store_3, [P64];
    Vstx_lane 1,
      [Disassembles_as [Use_operands [| VecArray (1, Dreg);
                                        CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; Qreg; Immed |],
      "vst1Q_lane", store_3, [U64; S64];

    (* VLD2 variants. *)
    Vldx 2, [], Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2", bits_1, pf_su_8_32;
    Vldx 2, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2", bits_1, [P64];
    Vldx 2, [Instruction_name ["vld1"]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2", bits_1, [S64; U64];
    Vldx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              CstPtrTo Corereg |];
                              Use_operands [| VecArray (2, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg |],
      "vld2Q", bits_1, pf_su_8_32;

    Vldx_lane 2,
      [Disassembles_as [Use_operands
                          [| VecArray (2, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg;
                      VecArray (2, Dreg); Immed |],
      "vld2_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
    Vldx_lane 2,
      [Disassembles_as [Use_operands
                          [| VecArray (2, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg;
                      VecArray (2, Qreg); Immed |],
      "vld2Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];

    Vldx_dup 2,
      [Disassembles_as [Use_operands
                          [| VecArray (2, All_elements_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2_dup", bits_1, pf_su_8_32;
    Vldx_dup 2,
      [Requires_feature "CRYPTO";
       Instruction_name ["vld1"];
       Disassembles_as [Use_operands
                          [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2_dup", bits_1, [P64];
    Vldx_dup 2,
      [Instruction_name ["vld1"];
       Disassembles_as [Use_operands
                          [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
      "vld2_dup", bits_1, [S64; U64];

    (* VST2 variants. *)
    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
      store_1, pf_su_8_32;
    Vstx 2, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
      store_1, [P64];
    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
      store_1, [S64; U64];
    Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |];
                              Use_operands [| VecArray (2, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (2, Qreg) |], "vst2Q",
      store_1, pf_su_8_32;

    Vstx_lane 2,
      [Disassembles_as [Use_operands
                          [| VecArray (2, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (2, Dreg); Immed |], "vst2_lane",
      store_3, P8 :: P16 :: F32 :: su_8_32;
    Vstx_lane 2,
      [Disassembles_as [Use_operands
                          [| VecArray (2, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (2, Qreg); Immed |], "vst2Q_lane",
      store_3, [P16; F32; U16; U32; S16; S32];

    (* VLD3 variants. *)
    Vldx 3, [], Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3", bits_1, pf_su_8_32;
    Vldx 3, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3", bits_1, [P64];
    Vldx 3, [Instruction_name ["vld1"]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3", bits_1, [S64; U64];
    Vldx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
                                              CstPtrTo Corereg |];
                              Use_operands [| VecArray (3, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg |],
      "vld3Q", bits_1, P8 :: P16 :: F32 :: su_8_32;

    Vldx_lane 3,
      [Disassembles_as [Use_operands
                          [| VecArray (3, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg;
                      VecArray (3, Dreg); Immed |],
      "vld3_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
    Vldx_lane 3,
      [Disassembles_as [Use_operands
                          [| VecArray (3, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg;
                      VecArray (3, Qreg); Immed |],
      "vld3Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];

    Vldx_dup 3,
      [Disassembles_as [Use_operands
                          [| VecArray (3, All_elements_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3_dup", bits_1, pf_su_8_32;
    Vldx_dup 3,
      [Requires_feature "CRYPTO";
       Instruction_name ["vld1"];
       Disassembles_as [Use_operands
                          [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3_dup", bits_1, [P64];
    Vldx_dup 3,
      [Instruction_name ["vld1"];
       Disassembles_as [Use_operands
                          [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
      "vld3_dup", bits_1, [S64; U64];

    (* VST3 variants. *)
    Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
      store_1, pf_su_8_32;
    Vstx 3, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
      store_1, [P64];
    Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
      store_1, [S64; U64];
    Vstx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
                                              PtrTo Corereg |];
                              Use_operands [| VecArray (3, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (3, Qreg) |], "vst3Q",
      store_1, pf_su_8_32;

    Vstx_lane 3,
      [Disassembles_as [Use_operands
                          [| VecArray (3, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (3, Dreg); Immed |], "vst3_lane",
      store_3, P8 :: P16 :: F32 :: su_8_32;
    Vstx_lane 3,
      [Disassembles_as [Use_operands
                          [| VecArray (3, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (3, Qreg); Immed |], "vst3Q_lane",
      store_3, [P16; F32; U16; U32; S16; S32];

    (* VLD4/VST4 variants. *)
    Vldx 4, [], Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4", bits_1, pf_su_8_32;
    Vldx 4, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4", bits_1, [P64];
    Vldx 4, [Instruction_name ["vld1"]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4", bits_1, [S64; U64];
    Vldx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              CstPtrTo Corereg |];
                              Use_operands [| VecArray (4, Dreg);
                                              CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg |],
      "vld4Q", bits_1, P8 :: P16 :: F32 :: su_8_32;

    Vldx_lane 4,
      [Disassembles_as [Use_operands
                          [| VecArray (4, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg;
                      VecArray (4, Dreg); Immed |],
      "vld4_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
    Vldx_lane 4,
      [Disassembles_as [Use_operands
                          [| VecArray (4, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg;
                      VecArray (4, Qreg); Immed |],
      "vld4Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];

    Vldx_dup 4,
      [Disassembles_as [Use_operands
                          [| VecArray (4, All_elements_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4_dup", bits_1, pf_su_8_32;
    Vldx_dup 4,
      [Requires_feature "CRYPTO";
       Instruction_name ["vld1"];
       Disassembles_as [Use_operands
                          [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4_dup", bits_1, [P64];
    Vldx_dup 4,
      [Instruction_name ["vld1"];
       Disassembles_as [Use_operands
                          [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
      Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
      "vld4_dup", bits_1, [S64; U64];

    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
      store_1, pf_su_8_32;
    Vstx 4, [Requires_feature "CRYPTO";
             Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
      store_1, [P64];
    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]];
             Instruction_name ["vst1"]],
      Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
      store_1, [S64; U64];
    Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |];
                              Use_operands [| VecArray (4, Dreg);
                                              PtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (4, Qreg) |], "vst4Q",
      store_1, pf_su_8_32;

    Vstx_lane 4,
      [Disassembles_as [Use_operands
                          [| VecArray (4, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (4, Dreg); Immed |], "vst4_lane",
      store_3, P8 :: P16 :: F32 :: su_8_32;
    Vstx_lane 4,
      [Disassembles_as [Use_operands
                          [| VecArray (4, Element_of_dreg);
                             CstPtrTo Corereg |]]],
      Use_operands [| PtrTo Corereg; VecArray (4, Qreg); Immed |], "vst4Q_lane",
      store_3, [P16; F32; U16; U32; S16; S32];

    (* Logical operations.  And. *)
    Vand, [], All (3, Dreg), "vand", notype_2, su_8_32;
    Vand, [No_op], All (3, Dreg), "vand", notype_2, [S64; U64];
    Vand, [], All (3, Qreg), "vandQ", notype_2, su_8_64;

    (* Or. *)
    Vorr, [], All (3, Dreg), "vorr", notype_2, su_8_32;
    Vorr, [No_op], All (3, Dreg), "vorr", notype_2, [S64; U64];
    Vorr, [], All (3, Qreg), "vorrQ", notype_2, su_8_64;

    (* Eor. *)
    Veor, [], All (3, Dreg), "veor", notype_2, su_8_32;
    Veor, [No_op], All (3, Dreg), "veor", notype_2, [S64; U64];
    Veor, [], All (3, Qreg), "veorQ", notype_2, su_8_64;

    (* Bic (And-not). *)
    Vbic, [Compiler_optim "-O2"], All (3, Dreg), "vbic", notype_2, su_8_32;
    Vbic, [No_op; Compiler_optim "-O2"], All (3, Dreg), "vbic", notype_2,
      [S64; U64];
    Vbic, [Compiler_optim "-O2"], All (3, Qreg), "vbicQ", notype_2, su_8_64;

    (* Or-not. *)
    Vorn, [Compiler_optim "-O2"], All (3, Dreg), "vorn", notype_2, su_8_32;
    Vorn, [No_op; Compiler_optim "-O2"], All (3, Dreg), "vorn", notype_2,
      [S64; U64];
    Vorn, [Compiler_optim "-O2"], All (3, Qreg), "vornQ", notype_2, su_8_64;
  ]

let type_in_crypto_only t
  = (t == P64) || (t == P128)

let cross_product s1 s2
  = List.filter (fun (e, e') -> e <> e')
      (List.concat (List.map (fun e1 -> List.map (fun e2 -> (e1, e2)) s1) s2))
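
(* Example (illustration only):
     cross_product [S8; U8] [S8; U8]
   evaluates to [(S8, U8); (U8, S8)] -- every ordered pair whose two
   components differ. *)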

let reinterp =
  let elems = P8 :: P16 :: F32 :: P64 :: su_8_64 in
  let casts = cross_product elems elems in
  List.map
    (fun (convto, convfrom) ->
       Vreinterp, (if (type_in_crypto_only convto) || (type_in_crypto_only convfrom)
                   then [Requires_feature "CRYPTO"] else []) @ [No_op],
       Use_operands [| Dreg; Dreg |],
       "vreinterpret", conv_1, [Cast (convto, convfrom)])
    casts

let reinterpq =
  let elems = P8 :: P16 :: F32 :: P64 :: P128 :: su_8_64 in
  let casts = cross_product elems elems in
  List.map
    (fun (convto, convfrom) ->
       Vreinterp, (if (type_in_crypto_only convto) || (type_in_crypto_only convfrom)
                   then [Requires_feature "CRYPTO"] else []) @ [No_op],
       Use_operands [| Qreg; Qreg |],
       "vreinterpretQ", conv_1, [Cast (convto, convfrom)])
    casts
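
(* Note (illustration only): each Cast (convto, convfrom) pair ultimately
   names an intrinsic via string_of_elt below, e.g. Cast (S8, U8) on the
   Dreg shape becomes vreinterpret_s8_u8, and on the Qreg shape
   vreinterpretq_s8_u8. *)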

(* Output routines. *)

let rec string_of_elt = function
    S8 -> "s8" | S16 -> "s16" | S32 -> "s32" | S64 -> "s64"
  | U8 -> "u8" | U16 -> "u16" | U32 -> "u32" | U64 -> "u64"
  | I8 -> "i8" | I16 -> "i16" | I32 -> "i32" | I64 -> "i64"
  | B8 -> "8" | B16 -> "16" | B32 -> "32" | B64 -> "64"
  | F16 -> "f16" | F32 -> "f32" | P8 -> "p8" | P16 -> "p16"
  | P64 -> "p64" | P128 -> "p128"
  | Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "_" ^ string_of_elt b
  | NoElts -> failwith "No elts"

let string_of_elt_dots elt =
  match elt with
    Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "." ^ string_of_elt b
  | _ -> string_of_elt elt
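
(* Examples (illustration only):
     string_of_elt (Cast (P8, S8))      = "p8_s8"
     string_of_elt_dots (Cast (P8, S8)) = "p8.s8"
     string_of_elt_dots U32             = "u32" *)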

let string_of_vectype vt =
  let rec name affix = function
      T_int8x8 -> affix "int8x8"
    | T_int8x16 -> affix "int8x16"
    | T_int16x4 -> affix "int16x4"
    | T_int16x8 -> affix "int16x8"
    | T_int32x2 -> affix "int32x2"
    | T_int32x4 -> affix "int32x4"
    | T_int64x1 -> affix "int64x1"
    | T_int64x2 -> affix "int64x2"
    | T_uint8x8 -> affix "uint8x8"
    | T_uint8x16 -> affix "uint8x16"
    | T_uint16x4 -> affix "uint16x4"
    | T_uint16x8 -> affix "uint16x8"
    | T_uint32x2 -> affix "uint32x2"
    | T_uint32x4 -> affix "uint32x4"
    | T_uint64x1 -> affix "uint64x1"
    | T_uint64x2 -> affix "uint64x2"
    | T_float16x4 -> affix "float16x4"
    | T_float32x2 -> affix "float32x2"
    | T_float32x4 -> affix "float32x4"
    | T_poly8x8 -> affix "poly8x8"
    | T_poly8x16 -> affix "poly8x16"
    | T_poly16x4 -> affix "poly16x4"
    | T_poly16x8 -> affix "poly16x8"
    | T_int8 -> affix "int8"
    | T_int16 -> affix "int16"
    | T_int32 -> affix "int32"
    | T_int64 -> affix "int64"
    | T_uint8 -> affix "uint8"
    | T_uint16 -> affix "uint16"
    | T_uint32 -> affix "uint32"
    | T_uint64 -> affix "uint64"
    | T_poly8 -> affix "poly8"
    | T_poly16 -> affix "poly16"
    | T_poly64 -> affix "poly64"
    | T_poly64x1 -> affix "poly64x1"
    | T_poly64x2 -> affix "poly64x2"
    | T_poly128 -> affix "poly128"
    | T_float16 -> affix "float16"
    | T_float32 -> affix "float32"
    | T_immediate _ -> "const int"
    | T_void -> "void"
    | T_intQI -> "__builtin_neon_qi"
    | T_intHI -> "__builtin_neon_hi"
    | T_intSI -> "__builtin_neon_si"
    | T_intDI -> "__builtin_neon_di"
    | T_intTI -> "__builtin_neon_ti"
    | T_floatHF -> "__builtin_neon_hf"
    | T_floatSF -> "__builtin_neon_sf"
    | T_arrayof (num, base) ->
        let basename = name (fun x -> x) base in
        affix (Printf.sprintf "%sx%d" basename num)
    | T_ptrto x ->
        let basename = name affix x in
        Printf.sprintf "%s *" basename
    | T_const x ->
        let basename = name affix x in
        Printf.sprintf "const %s" basename
  in
  name (fun x -> x ^ "_t") vt
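
(* Examples (illustration only):
     string_of_vectype T_int8x8                     = "int8x8_t"
     string_of_vectype (T_arrayof (2, T_int8x8))    = "int8x8x2_t"
     string_of_vectype (T_ptrto (T_const T_int8x8)) = "const int8x8_t *"
   Only the innermost name receives the "_t" affix; T_arrayof applies it
   after appending the element count. *)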

let string_of_inttype = function
    B_TImode -> "__builtin_neon_ti"
  | B_EImode -> "__builtin_neon_ei"
  | B_OImode -> "__builtin_neon_oi"
  | B_CImode -> "__builtin_neon_ci"
  | B_XImode -> "__builtin_neon_xi"

let string_of_mode = function
    V8QI -> "v8qi" | V4HI -> "v4hi" | V4HF -> "v4hf" | V2SI -> "v2si"
  | V2SF -> "v2sf" | DI -> "di" | V16QI -> "v16qi" | V8HI -> "v8hi"
  | V4SI -> "v4si" | V4SF -> "v4sf" | V2DI -> "v2di" | QI -> "qi"
  | HI -> "hi" | SI -> "si" | SF -> "sf" | TI -> "ti"

(* Uppercase chars are used for letters which form part of the intrinsic
   name but should be omitted from the builtin name (the info is passed
   in an extra argument instead). *)
let intrinsic_name name = String.lowercase name

(* Allow the name of the builtin to be overridden by entries in the
   features list (Flipped or Builtin_name). *)
let builtin_name features name =
  let name = List.fold_right
               (fun el name ->
                  match el with
                    Flipped x | Builtin_name x -> x
                  | _ -> name)
               features name in
  let islower x = let str = String.make 1 x in (String.lowercase str) = str
  and buf = Buffer.create (String.length name) in
  String.iter (fun c -> if islower c then Buffer.add_char buf c) name;
  Buffer.contents buf
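
(* Examples (illustration only): with no overriding features,
     builtin_name [] "vmlaQ_n" = "vmla_n"
   since the uppercase Q is dropped, whereas
     intrinsic_name "vmlaQ_n" = "vmlaq_n"
   keeps it, lowercased, in the user-visible name. *)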

(* Transform an arity into a list of strings. *)
let strings_of_arity a =
  match a with
  | Arity0 vt -> [string_of_vectype vt]
  | Arity1 (vt1, vt2) -> [string_of_vectype vt1; string_of_vectype vt2]
  | Arity2 (vt1, vt2, vt3) -> [string_of_vectype vt1;
                               string_of_vectype vt2;
                               string_of_vectype vt3]
  | Arity3 (vt1, vt2, vt3, vt4) -> [string_of_vectype vt1;
                                    string_of_vectype vt2;
                                    string_of_vectype vt3;
                                    string_of_vectype vt4]
  | Arity4 (vt1, vt2, vt3, vt4, vt5) -> [string_of_vectype vt1;
                                         string_of_vectype vt2;
                                         string_of_vectype vt3;
                                         string_of_vectype vt4;
                                         string_of_vectype vt5]

(* Suffixes on the end of builtin names that are to be stripped in order
   to obtain the name used as an instruction.  They are only stripped if
   preceded immediately by an underscore. *)
let suffixes_to_strip = [ "n"; "lane"; "dup" ]

(* Get the possible names of an instruction corresponding to a "name" from
   the ops table.  This is done by taking the equivalent builtin name and
   stripping any suffixes from the suffixes_to_strip list above, unless the
   features list contains an Instruction_name entry, in which case that is
   used; or unless it contains a Flipped entry, in which case that is used.
   If both kinds of entry are present, whichever comes first in the list
   wins. *)
let get_insn_names features name =
  let names = try
    begin
      match List.find (fun feature -> match feature with
                                        Instruction_name _ -> true
                                      | Flipped _ -> true
                                      | _ -> false) features
      with
        Instruction_name names -> names
      | Flipped name -> [name]
      | _ -> assert false
    end
  with Not_found -> [builtin_name features name]
  in
  begin
    List.map (fun name' ->
      try
        let underscore = String.rindex name' '_' in
        let our_suffix = String.sub name' (underscore + 1)
                                    ((String.length name') - underscore - 1)
        in
        let rec strip remaining_suffixes =
          match remaining_suffixes with
            [] -> name'
          | s::ss when our_suffix = s -> String.sub name' 0 underscore
          | _::ss -> strip ss
        in
        strip suffixes_to_strip
      with (Not_found | Invalid_argument _) -> name') names
  end
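
(* Examples (illustration only):
     get_insn_names [] "vmla_n" = ["vmla"]
   (the "n" suffix follows an underscore and is in suffixes_to_strip), and
     get_insn_names [Instruction_name ["vbsl"; "vbit"; "vbif"]] "vbsl"
       = ["vbsl"; "vbit"; "vbif"]. *)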

(* Apply a function to each element of a list and then comma-separate
   the resulting strings. *)
let rec commas f elts acc =
  match elts with
    [] -> acc
  | [elt] -> acc ^ (f elt)
  | elt::elts -> commas f elts (acc ^ (f elt) ^ ", ")
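
(* Example (illustration only):
     commas string_of_vectype [T_int8x8; T_int16x4] ""
   evaluates to "int8x8_t, int16x4_t". *)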

(* Given a list of features and the shape specified in the "ops" table,
   apply a function to each possible shape the instruction may have.
   By default this is the "shape" entry in "ops".  If the features list
   contains a Disassembles_as entry, the shapes contained in that entry
   are mapped to corresponding outputs and returned in a list.  If there
   is more than one Disassembles_as entry, only the first is used. *)
let analyze_all_shapes features shape f =
  try
    match List.find (fun feature ->
                       match feature with Disassembles_as _ -> true
                                        | _ -> false)
                    features with
      Disassembles_as shapes -> List.map f shapes
    | _ -> assert false
  with Not_found -> [f shape]
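
(* Sketch of the behaviour (illustration only): given
     features = [Disassembles_as [shape_a; shape_b]]
   the result is [f shape_a; f shape_b]; with no Disassembles_as entry it
   is simply [f shape].  (shape_a and shape_b are hypothetical.) *)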

(* The crypto intrinsics have unconventional shapes and are not numerous
   enough to be worth the trouble of encoding in the ops table, so we
   write them out explicitly here. *)
let crypto_intrinsics =
"
#ifdef __ARM_FEATURE_CRYPTO

__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
vldrq_p128 (poly128_t const * __ptr)
{
#ifdef __ARM_BIG_ENDIAN
  poly64_t* __ptmp = (poly64_t*) __ptr;
  poly64_t __d0 = vld1_p64 (__ptmp);
  poly64_t __d1 = vld1_p64 (__ptmp + 1);
  return vreinterpretq_p128_p64 (vcombine_p64 (__d1, __d0));
#else
  return vreinterpretq_p128_p64 (vld1q_p64 ((poly64_t*) __ptr));
#endif
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vstrq_p128 (poly128_t * __ptr, poly128_t __val)
{
#ifdef __ARM_BIG_ENDIAN
  poly64x2_t __tmp = vreinterpretq_p64_p128 (__val);
  poly64_t __d0 = vget_high_p64 (__tmp);
  poly64_t __d1 = vget_low_p64 (__tmp);
  vst1q_p64 ((poly64_t*) __ptr, vcombine_p64 (__d0, __d1));
#else
  vst1q_p64 ((poly64_t*) __ptr, vreinterpretq_p64_p128 (__val));
#endif
}

/* The vceq_p64 intrinsic does not map to a single instruction.
   Instead we emulate it by performing a 32-bit variant of the vceq
   and applying a pairwise min reduction to the result.
   vceq_u32 will produce two 32-bit halves, each of which will contain
   either all ones or all zeros depending on whether the corresponding
   32-bit halves of the poly64_t were equal.  The whole poly64_t values
   are equal if and only if both halves are equal, i.e. vceq_u32 returns
   all ones.  If the result is all zeroes for any half, then the whole
   result is zeroes.  This is what the pairwise min reduction achieves.  */
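
/* Worked example (illustrative values only): for the 64-bit inputs
   0x00000001FFFFFFFF and 0x00000002FFFFFFFF, vceq_u32 gives
   { 0xFFFFFFFF, 0x00000000 } (low halves equal, high halves not), and
   the pairwise min folds that to all zeros: the values are unequal.  */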

__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vceq_p64 (poly64x1_t __a, poly64x1_t __b)
{
  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
  uint32x2_t __c = vceq_u32 (__t_a, __t_b);
  uint32x2_t __m = vpmin_u32 (__c, __c);
  return vreinterpret_u64_u32 (__m);
}

/* The vtst_p64 intrinsic does not map to a single instruction.
   We emulate it in a way similar to vceq_p64 above, but here we do
   a reduction with max, since if any pair of corresponding bits
   in the two poly64_t's are both set, the whole result must be all ones.  */
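
/* Worked example (illustrative values only): for the inputs
   0x0000000100000000 and 0x0000000300000000 the high halves share a set
   bit, so vtst_u32 gives { 0x00000000, 0xFFFFFFFF } and the pairwise max
   folds that to all ones.  */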

__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vtst_p64 (poly64x1_t __a, poly64x1_t __b)
{
  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
  uint32x2_t __c = vtst_u32 (__t_a, __t_b);
  uint32x2_t __m = vpmax_u32 (__c, __c);
  return vreinterpret_u64_u32 (__m);
}

__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
{
  return __builtin_arm_crypto_aese (__data, __key);
}

__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaesdq_u8 (uint8x16_t __data, uint8x16_t __key)
{
  return __builtin_arm_crypto_aesd (__data, __key);
}

__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaesmcq_u8 (uint8x16_t __data)
{
  return __builtin_arm_crypto_aesmc (__data);
}

__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaesimcq_u8 (uint8x16_t __data)
{
  return __builtin_arm_crypto_aesimc (__data);
}

__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vsha1h_u32 (uint32_t __hash_e)
{
  uint32x4_t __t = vdupq_n_u32 (0);
  __t = vsetq_lane_u32 (__hash_e, __t, 0);
  __t = __builtin_arm_crypto_sha1h (__t);
  return vgetq_lane_u32 (__t, 0);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
  uint32x4_t __t = vdupq_n_u32 (0);
  __t = vsetq_lane_u32 (__hash_e, __t, 0);
  return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
  uint32x4_t __t = vdupq_n_u32 (0);
  __t = vsetq_lane_u32 (__hash_e, __t, 0);
  return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{
  uint32x4_t __t = vdupq_n_u32 (0);
  __t = vsetq_lane_u32 (__hash_e, __t, 0);
  return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11)
{
  return __builtin_arm_crypto_sha1su0 (__w0_3, __w4_7, __w8_11);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15)
{
  return __builtin_arm_crypto_sha1su1 (__tw0_3, __w12_15);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
{
  return __builtin_arm_crypto_sha256h (__hash_abcd, __hash_efgh, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha256h2q_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
{
  return __builtin_arm_crypto_sha256h2 (__hash_abcd, __hash_efgh, __wk);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7)
{
  return __builtin_arm_crypto_sha256su0 (__w0_3, __w4_7);
}

__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15)
{
  return __builtin_arm_crypto_sha256su1 (__tw0_3, __w8_11, __w12_15);
}

__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
vmull_p64 (poly64_t __a, poly64_t __b)
{
  return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __a, (uint64_t) __b);
}

__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
{
  poly64_t __t1 = vget_high_p64 (__a);
  poly64_t __t2 = vget_high_p64 (__b);

  return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __t1, (uint64_t) __t2);
}
#endif
"