Missed part from my V1DI'fication. This makes something around 50
[llvm-gcc-4.2.git] / gcc / config / arm / neon.ml
blob6c10c409883fca86d5062dad54a6f125a3f5b750
1 (* APPLE LOCAL file v7 support. Merge from Codesourcery *)
2 (* Common code for ARM NEON header file, documentation and test case
3 generators.
5 Copyright (C) 2006 Free Software Foundation, Inc.
6 Contributed by CodeSourcery.
8 This file is part of GCC.
10 GCC is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free
12 Software Foundation; either version 2, or (at your option) any later
13 version.
15 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 for more details.
20 You should have received a copy of the GNU General Public License
21 along with GCC; see the file COPYING. If not, write to the Free
22 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
23 02110-1301, USA. *)
(* Shorthand types for vector elements. *)
type elts = S8 | S16 | S32 | S64 | F32 | U8 | U16 | U32 | U64 | P8 | P16
          | I8 | I16 | I32 | I64 | B8 | B16 | B32 | B64 | Conv of elts * elts
          | Cast of elts * elts | NoElts

(* Broad classification of the element types above. *)
type eltclass = Signed | Unsigned | Float | Poly | Int | Bits
              | ConvClass of eltclass * eltclass | NoType

(* These vector types correspond directly to C types. *)
type vectype = T_int8x8 | T_int8x16
             | T_int16x4 | T_int16x8
             | T_int32x2 | T_int32x4
             | T_int64x1 | T_int64x2
             | T_uint8x8 | T_uint8x16
             | T_uint16x4 | T_uint16x8
             | T_uint32x2 | T_uint32x4
             | T_uint64x1 | T_uint64x2
             | T_float32x2 | T_float32x4
             | T_poly8x8 | T_poly8x16
             | T_poly16x4 | T_poly16x8
             | T_immediate of int * int
             | T_int8 | T_int16
             | T_int32 | T_int64
             | T_uint8 | T_uint16
             | T_uint32 | T_uint64
             | T_poly8 | T_poly16
             | T_float32 | T_arrayof of int * vectype
             | T_ptrto of vectype | T_const of vectype
             | T_void | T_intQI
             | T_intHI | T_intSI
             | T_intDI

(* The meanings of the following are:
     TImode : "Tetra", two registers (four words).
     EImode : "hExa", three registers (six words).
     OImode : "Octa", four registers (eight words).
     CImode : "dodeCa", six registers (twelve words).
     XImode : "heXadeca", eight registers (sixteen words).
*)

(* LLVM LOCAL begin Use a different type for each vector type. *)
type inttype = B_TId8mode | B_EId8mode | B_OId8mode
             | B_TId16mode | B_EId16mode | B_OId16mode
             | B_TId32mode | B_EId32mode | B_OId32mode
             | B_TId64mode | B_EId64mode | B_OId64mode
             | B_TIdSFmode | B_EIdSFmode | B_OIdSFmode
             | B_OIq8mode | B_CIq8mode | B_XIq8mode
             | B_OIq16mode | B_CIq16mode | B_XIq16mode
             | B_OIq32mode | B_CIq32mode | B_XIq32mode
             | B_OIq64mode | B_CIq64mode | B_XIq64mode
             | B_OIqSFmode | B_CIqSFmode | B_XIqSFmode
(* LLVM LOCAL end Use a different type for each vector type. *)

(* Register/operand shapes an instruction operand can take. *)
type shape_elt = Dreg | Qreg | Corereg | Immed | VecArray of int * shape_elt
               | PtrTo of shape_elt | CstPtrTo of shape_elt
               (* These next ones are used only in the test generator. *)
               | Element_of_dreg        (* Used for "lane" variants. *)
               | Element_of_qreg        (* Likewise. *)
               | All_elements_of_dreg   (* Used for "dup" variants. *)

(* Overall operand shape of an instruction. *)
type shape_form = All of int * shape_elt
                | Long
                | Long_noreg of shape_elt
                | Wide
                | Wide_noreg of shape_elt
                | Narrow
                | Long_imm
                | Narrow_imm
                | Binary_imm of shape_elt
                | Use_operands of shape_elt array
                | By_scalar of shape_elt
                | Unary_scalar of shape_elt
                | Wide_lane
                | Wide_scalar
                | Pair_result of shape_elt

(* Arity of an intrinsic: result type followed by argument types. *)
type arity = Arity0 of vectype
           | Arity1 of vectype * vectype
           | Arity2 of vectype * vectype * vectype
           | Arity3 of vectype * vectype * vectype * vectype
           | Arity4 of vectype * vectype * vectype * vectype * vectype

(* LLVM LOCAL *)
type vecmode = V8QI | V4HI | V2SI | V2SF | V1DI
             | V16QI | V8HI | V4SI | V4SF | V2DI
             | QI | HI | SI | SF | DI
(* Opcodes for the NEON intrinsics handled by the generators. *)
type opcode =
  (* Binary ops. *)
    Vadd
  | Vmul
  | Vmla
  | Vmls
  | Vsub
  | Vceq
  | Vcge
  | Vcgt
  | Vcle
  | Vclt
  | Vcage
  | Vcagt
  | Vcale
  | Vcalt
  | Vtst
  | Vabd
  | Vaba
  | Vmax
  | Vmin
  | Vpadd
  | Vpada
  | Vpmax
  | Vpmin
  | Vrecps
  | Vrsqrts
  | Vshl
  | Vshr_n
  | Vshl_n
  | Vsra_n
  | Vsri
  | Vsli
  (* Logic binops. *)
  | Vand
  | Vorr
  | Veor
  | Vbic
  | Vorn
  | Vbsl
  (* Ops with scalar. *)
  | Vmul_lane
  | Vmla_lane
  | Vmls_lane
  | Vmul_n
  | Vmla_n
  | Vmls_n
  | Vmull_n
  | Vmull_lane
  | Vqdmull_n
  | Vqdmull_lane
  | Vqdmulh_n
  | Vqdmulh_lane
  (* Unary ops. *)
  | Vabs
  | Vneg
  | Vcls
  | Vclz
  | Vcnt
  | Vrecpe
  | Vrsqrte
  | Vmvn
  (* Vector extract. *)
  | Vext
  (* Reverse elements. *)
  | Vrev64
  | Vrev32
  | Vrev16
  (* Transposition ops. *)
  | Vtrn
  | Vzip
  | Vuzp
  (* Loads and stores (VLD1/VST1/VLD2...), elements and structures. *)
  | Vldx of int
  | Vstx of int
  | Vldx_lane of int
  | Vldx_dup of int
  | Vstx_lane of int
  (* Set/extract lanes from a vector. *)
  | Vget_lane
  | Vset_lane
  (* Initialise vector from bit pattern. *)
  | Vcreate
  (* Set all lanes to same value. *)
  | Vdup_n
  | Vmov_n  (* Is this the same? *)
  (* Duplicate scalar to all lanes of vector. *)
  | Vdup_lane
  (* Combine vectors. *)
  | Vcombine
  (* Get quadword high/low parts. *)
  | Vget_high
  | Vget_low
  (* Convert vectors. *)
  | Vcvt
  | Vcvt_n
  (* Narrow/lengthen vectors. *)
  | Vmovn
  | Vmovl
  (* Table lookup. *)
  | Vtbl of int
  | Vtbx of int
  (* Reinterpret casts. *)
  | Vreinterp
(* Features used for documentation, to distinguish between some instruction
   variants, and to signal special requirements (e.g. swapping arguments). *)

type features =
    Halving
  | Rounding
  | Saturating
  | Dst_unsign
  | High_half
  | Doubling
  | Flipped of string  (* Builtin name to use with flipped arguments. *)
  | InfoWord  (* Pass an extra word for signage/rounding etc. (always passed
                 for All _, Long, Wide, Narrow shape_forms. *)
  | ReturnPtr  (* Pass explicit pointer to return value as first argument. *)
    (* A specification as to the shape of instruction expected upon
       disassembly, used if it differs from the shape used to build the
       intrinsic prototype.  Multiple entries in the constructor's argument
       indicate that the intrinsic expands to more than one assembly
       instruction, each with a corresponding shape specified here. *)
  | Disassembles_as of shape_form list
  | Builtin_name of string  (* Override the name of the builtin. *)
    (* Override the name of the instruction.  If more than one name
       is specified, it means that the instruction can have any of those
       names. *)
  | Instruction_name of string list
    (* Mark that the intrinsic yields no instructions, or expands to yield
       behaviour that the test generator cannot test. *)
  | No_op
    (* Mark that the intrinsic has constant arguments that cannot be set
       to the defaults (zero for pointers and one otherwise) in the test
       cases.  The function supplied must return the integer to be written
       into the testcase for the argument number (0-based) supplied to it. *)
  | Const_valuator of (int -> int)
(* Raised when an operation mixes two element types that cannot be
   reconciled (e.g. taking the width of a Cast). *)
exception MixedMode of elts * elts

(* Return the width, in bits, of an element type.
   Raises [MixedMode] for a Cast and [Failure] for NoElts or a Conv
   whose source and destination widths disagree. *)
let rec elt_width = function
    S8 | U8 | P8 | I8 | B8 -> 8
  | S16 | U16 | P16 | I16 | B16 -> 16
  | S32 | F32 | U32 | I32 | B32 -> 32
  | S64 | U64 | I64 | B64 -> 64
  | Conv (a, b) ->
      (* A conversion only has a well-defined width when both sides agree. *)
      let wa = elt_width a and wb = elt_width b in
      if wa = wb then wa else failwith "element width?"
  | Cast (a, b) -> raise (MixedMode (a, b))
  | NoElts -> failwith "No elts"
(* Classify an element type into its eltclass. Conversions and casts are
   classified recursively on both sides. *)
let rec elt_class = function
    S8 | S16 | S32 | S64 -> Signed
  | U8 | U16 | U32 | U64 -> Unsigned
  | P8 | P16 -> Poly
  | F32 -> Float
  | I8 | I16 | I32 | I64 -> Int
  | B8 | B16 | B32 | B64 -> Bits
  | Conv (a, b) | Cast (a, b) -> ConvClass (elt_class a, elt_class b)
  | NoElts -> NoType
(* Build an element type from a class and a bit width.
   Raises [Failure] for unsupported combinations (e.g. Float at 8 bits,
   or Conv/NoType classes). *)
let elt_of_class_width c w =
  match c, w with
    Signed, 8 -> S8
  | Signed, 16 -> S16
  | Signed, 32 -> S32
  | Signed, 64 -> S64
  | Float, 32 -> F32
  | Unsigned, 8 -> U8
  | Unsigned, 16 -> U16
  | Unsigned, 32 -> U32
  | Unsigned, 64 -> U64
  | Poly, 8 -> P8
  | Poly, 16 -> P16
  | Int, 8 -> I8
  | Int, 16 -> I16
  | Int, 32 -> I32
  | Int, 64 -> I64
  | Bits, 8 -> B8
  | Bits, 16 -> B16
  | Bits, 32 -> B32
  | Bits, 64 -> B64
  | _ -> failwith "Bad element type"
(* Return unsigned integer element the same width as argument. *)
let unsigned_of_elt elt =
  elt_of_class_width Unsigned (elt_width elt)

(* Return signed integer element the same width as argument. *)
let signed_of_elt elt =
  elt_of_class_width Signed (elt_width elt)

(* Return untyped bits element the same width as argument. *)
let bits_of_elt elt =
  elt_of_class_width Bits (elt_width elt)

(* Map signed and unsigned element types to the type-neutral Int variant
   of the same width; all other element types pass through unchanged. *)
let non_signed_variant = function
    S8 -> I8
  | S16 -> I16
  | S32 -> I32
  | S64 -> I64
  | U8 -> I8
  | U16 -> I16
  | U32 -> I32
  | U64 -> I64
  | x -> x

(* Map polynomial element types to the unsigned variant of the same width;
   other classes are left unchanged. *)
let poly_unsigned_variant v =
  let elclass = match elt_class v with
    Poly -> Unsigned
  | x -> x in
  elt_of_class_width elclass (elt_width v)

(* Return the element of the same class but twice the width. *)
let widen_elt elt =
  let w = elt_width elt
  and c = elt_class elt in
  elt_of_class_width c (w * 2)

(* Return the element of the same class but half the width. *)
let narrow_elt elt =
  let w = elt_width elt
  and c = elt_class elt in
  elt_of_class_width c (w / 2)
(* If we're trying to find a mode from a "Use_operands" instruction, use the
   last vector operand as the dominant mode used to invoke the correct builtin.
   We must stick to this rule in neon.md. *)
let find_key_operand operands =
  (* Scan backwards from the last operand for the first D/Q register
     (possibly inside a VecArray). *)
  let rec scan opno =
    match operands.(opno) with
      Qreg -> Qreg
    | Dreg -> Dreg
    | VecArray (_, Qreg) -> Qreg
    | VecArray (_, Dreg) -> Dreg
    | _ -> scan (opno-1)
  in
  scan ((Array.length operands) - 1)
(* Map an element type and an instruction shape to the machine mode used to
   invoke the corresponding builtin.  Raises [Failure] for element widths
   other than 8/16/32/64 or for shapes with no associated mode. *)
let rec mode_of_elt elt shape =
  (* Floating-point (including conversions to float) selects the SF-based
     modes in the 32-bit slot. *)
  let flt = match elt_class elt with
    Float | ConvClass(_, Float) -> true | _ -> false in
  let idx =
    match elt_width elt with
      8 -> 0 | 16 -> 1 | 32 -> 2 | 64 -> 3
    | _ -> failwith "Bad element width"
  in match shape with
    All (_, Dreg) | By_scalar Dreg | Pair_result Dreg | Unary_scalar Dreg
  | Binary_imm Dreg | Long_noreg Dreg | Wide_noreg Dreg ->
      (* LLVM LOCAL *)
      [| V8QI; V4HI; if flt then V2SF else V2SI; V1DI |].(idx)
  | All (_, Qreg) | By_scalar Qreg | Pair_result Qreg | Unary_scalar Qreg
  | Binary_imm Qreg | Long_noreg Qreg | Wide_noreg Qreg ->
      [| V16QI; V8HI; if flt then V4SF else V4SI; V2DI |].(idx)
  | All (_, (Corereg | PtrTo _ | CstPtrTo _)) ->
      [| QI; HI; if flt then SF else SI; DI |].(idx)
  | Long | Wide | Wide_lane | Wide_scalar
  | Long_imm ->
      (* LLVM LOCAL *)
      [| V8QI; V4HI; V2SI; V1DI |].(idx)
  | Narrow | Narrow_imm -> [| V16QI; V8HI; V4SI; V2DI |].(idx)
  | Use_operands ops -> mode_of_elt elt (All (0, (find_key_operand ops)))
  | _ -> failwith "invalid shape"
(* Modify an element type dependent on the shape of the instruction and the
   operand number.  Returns a function to apply to the element type:
   operand 0 is the result, operands 1.. are the arguments. *)

let shapemap shape no =
  let ident = fun x -> x in
  match shape with
    All _ | Use_operands _ | By_scalar _ | Pair_result _ | Unary_scalar _
  | Binary_imm _ -> ident
  | Long | Long_noreg _ | Wide_scalar | Long_imm ->
      [| widen_elt; ident; ident |].(no)
  | Wide | Wide_noreg _ -> [| widen_elt; widen_elt; ident |].(no)
  | Wide_lane -> [| widen_elt; ident; ident; ident |].(no)
  | Narrow | Narrow_imm -> [| narrow_elt; ident; ident |].(no)
(* Register type (D/Q) of an operand, based on shape and operand number.
   Operand 0 is the result; operands 1.. are the arguments. *)

let regmap shape no =
  match shape with
    All (_, reg) | Long_noreg reg | Wide_noreg reg -> reg
  | Long -> [| Qreg; Dreg; Dreg |].(no)
  | Wide -> [| Qreg; Qreg; Dreg |].(no)
  | Narrow -> [| Dreg; Qreg; Qreg |].(no)
  | Wide_lane -> [| Qreg; Dreg; Dreg; Immed |].(no)
  | Wide_scalar -> [| Qreg; Dreg; Corereg |].(no)
  | By_scalar reg -> [| reg; reg; Dreg; Immed |].(no)
  | Unary_scalar reg -> [| reg; Dreg; Immed |].(no)
  | Pair_result reg -> [| VecArray (2, reg); reg; reg |].(no)
  | Binary_imm reg -> [| reg; reg; Immed |].(no)
  | Long_imm -> [| Qreg; Dreg; Immed |].(no)
  | Narrow_imm -> [| Dreg; Qreg; Immed |].(no)
  | Use_operands these -> these.(no)
(* Return the C vector type for operand [no] of an instruction with the
   given shape and element type.  Raises [Failure] on element types with
   no corresponding C type for the register class. *)
let type_for_elt shape elt no =
  let elt = (shapemap shape no) elt in
  let reg = regmap shape no in
  let rec type_for_reg_elt reg elt =
    match reg with
      Dreg ->
        begin match elt with
          S8 -> T_int8x8
        | S16 -> T_int16x4
        | S32 -> T_int32x2
        | S64 -> T_int64x1
        | U8 -> T_uint8x8
        | U16 -> T_uint16x4
        | U32 -> T_uint32x2
        | U64 -> T_uint64x1
        | F32 -> T_float32x2
        | P8 -> T_poly8x8
        | P16 -> T_poly16x4
        | _ -> failwith "Bad elt type"
        end
    | Qreg ->
        begin match elt with
          S8 -> T_int8x16
        | S16 -> T_int16x8
        | S32 -> T_int32x4
        | S64 -> T_int64x2
        | U8 -> T_uint8x16
        | U16 -> T_uint16x8
        | U32 -> T_uint32x4
        | U64 -> T_uint64x2
        | F32 -> T_float32x4
        | P8 -> T_poly8x16
        | P16 -> T_poly16x8
        | _ -> failwith "Bad elt type"
        end
    | Corereg ->
        begin match elt with
          S8 -> T_int8
        | S16 -> T_int16
        | S32 -> T_int32
        | S64 -> T_int64
        | U8 -> T_uint8
        | U16 -> T_uint16
        | U32 -> T_uint32
        | U64 -> T_uint64
        | P8 -> T_poly8
        | P16 -> T_poly16
        | F32 -> T_float32
        | _ -> failwith "Bad elt type"
        end
    | Immed ->
        T_immediate (0, 0)
    | VecArray (num, sub) ->
        T_arrayof (num, type_for_reg_elt sub elt)
    | PtrTo x ->
        T_ptrto (type_for_reg_elt x elt)
    | CstPtrTo x ->
        T_ptrto (T_const (type_for_reg_elt x elt))
    (* Anything else is solely for the use of the test generator. *)
    | _ -> assert false
  in
  type_for_reg_elt reg elt
(* Return size of a vector type, in bits.
   Raises [Not_found] for non-vector types. *)
let vectype_size = function
    T_int8x8 | T_int16x4 | T_int32x2 | T_int64x1
  | T_uint8x8 | T_uint16x4 | T_uint32x2 | T_uint64x1
  | T_float32x2 | T_poly8x8 | T_poly16x4 -> 64
  | T_int8x16 | T_int16x8 | T_int32x4 | T_int64x2
  | T_uint8x16 | T_uint16x8 | T_uint32x4 | T_uint64x2
  | T_float32x4 | T_poly8x16 | T_poly16x8 -> 128
  | _ -> raise Not_found
(* LLVM LOCAL begin Map vector types to modes. *)
(* Return the machine mode corresponding to a C vector type.
   Raises [Not_found] for non-vector types. *)
let vectype_mode = function
    T_int8x8 | T_uint8x8 | T_poly8x8 -> V8QI
  | T_int8x16 | T_uint8x16 | T_poly8x16 -> V16QI
  | T_int16x4 | T_uint16x4 | T_poly16x4 -> V4HI
  | T_int16x8 | T_uint16x8 | T_poly16x8 -> V8HI
  | T_int32x2 | T_uint32x2 -> V2SI
  | T_int32x4 | T_uint32x4 -> V4SI
  (* LLVM LOCAL *)
  | T_int64x1 | T_uint64x1 -> V1DI
  | T_int64x2 | T_uint64x2 -> V2DI
  | T_float32x2 -> V2SF
  | T_float32x4 -> V4SF
  | _ -> raise Not_found
(* LLVM LOCAL end Map vector types to modes. *)
(* Return the opaque integer type used to pass an array of [num] vectors of
   type [elttype] to a builtin.  Raises [Failure] for unsupported sizes. *)
let inttype_for_array num elttype =
  let eltsize = vectype_size elttype in
  let numwords = (num * eltsize) / 32 in
  (* LLVM LOCAL begin Match vector type, too. *)
  let vecmode = vectype_mode elttype in
  match numwords, vecmode with
    4, V8QI -> B_TId8mode
  | 4, V4HI -> B_TId16mode
  | 4, V2SI -> B_TId32mode
  (* LLVM LOCAL *)
  | 4, V1DI -> B_TId64mode
  | 4, V2SF -> B_TIdSFmode
  | 6, V8QI -> B_EId8mode
  | 6, V4HI -> B_EId16mode
  | 6, V2SI -> B_EId32mode
  (* LLVM LOCAL *)
  | 6, V1DI -> B_EId64mode
  | 6, V2SF -> B_EIdSFmode
  | 8, V8QI -> B_OId8mode
  | 8, V4HI -> B_OId16mode
  | 8, V2SI -> B_OId32mode
  (* LLVM LOCAL *)
  | 8, V1DI -> B_OId64mode
  | 8, V2SF -> B_OIdSFmode
  | 8, V16QI -> B_OIq8mode
  | 8, V8HI -> B_OIq16mode
  | 8, V4SI -> B_OIq32mode
  | 8, V2DI -> B_OIq64mode
  | 8, V4SF -> B_OIqSFmode
  | 12, V16QI -> B_CIq8mode
  | 12, V8HI -> B_CIq16mode
  | 12, V4SI -> B_CIq32mode
  | 12, V2DI -> B_CIq64mode
  | 12, V4SF -> B_CIqSFmode
  | 16, V16QI -> B_XIq8mode
  | 16, V8HI -> B_XIq16mode
  | 16, V4SI -> B_XIq32mode
  | 16, V2DI -> B_XIq64mode
  | 16, V4SF -> B_XIqSFmode
  | _ -> failwith ("no int type for size " ^ string_of_int numwords)
(* LLVM LOCAL end Match vector type, too. *)
(* These functions return pairs of (internal, external) types, where "internal"
   types are those seen by GCC, and "external" are those seen by the assembler.
   These types aren't necessarily the same, since the intrinsics can munge more
   than one C type into each assembler opcode. *)

(* Wrap a typing function so that its resulting element type is collapsed
   to the sign-neutral Int variant. *)
let make_sign_invariant func shape elt =
  let arity, elt' = func shape elt in
  arity, non_signed_variant elt'

(* Don't restrict any types. *)

let elts_same make_arity shape elt =
  let vtype = type_for_elt shape elt in
  make_arity vtype, elt

(* As sign_invar_*, but when sign matters. *)
let elts_same_io_lane =
  elts_same (fun vtype -> Arity4 (vtype 0, vtype 0, vtype 1, vtype 2, vtype 3))

let elts_same_io =
  elts_same (fun vtype -> Arity3 (vtype 0, vtype 0, vtype 1, vtype 2))

let elts_same_2_lane =
  elts_same (fun vtype -> Arity3 (vtype 0, vtype 1, vtype 2, vtype 3))

let elts_same_3 = elts_same_2_lane

let elts_same_2 =
  elts_same (fun vtype -> Arity2 (vtype 0, vtype 1, vtype 2))

let elts_same_1 =
  elts_same (fun vtype -> Arity1 (vtype 0, vtype 1))

(* Use for signed/unsigned invariant operations (i.e. where the operation
   doesn't depend on the sign of the data. *)

let sign_invar_io_lane = make_sign_invariant elts_same_io_lane
let sign_invar_io = make_sign_invariant elts_same_io
let sign_invar_2_lane = make_sign_invariant elts_same_2_lane
let sign_invar_2 = make_sign_invariant elts_same_2
let sign_invar_1 = make_sign_invariant elts_same_1
(* Sign-sensitive comparison: result is always unsigned, same width. *)

let cmp_sign_matters shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
  Arity2 (rtype, vtype 1, vtype 2), elt

(* Signed/unsigned invariant comparison. *)

let cmp_sign_invar shape elt =
  let shape', elt' = cmp_sign_matters shape elt in
  (* Polynomial compares are also width-only: fold P8 into I8. *)
  let elt'' =
    match non_signed_variant elt' with
      P8 -> I8
    | x -> x
  in
  shape', elt''

(* Comparison (VTST) where only the element width matters. *)

let cmp_bits shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0
  and bits_only = bits_of_elt elt in
  Arity2 (rtype, vtype 1, vtype 2), bits_only

(* Register-controlled shift: the shift-count operand is always signed. *)
let reg_shift shape elt =
  let vtype = type_for_elt shape elt
  and op2type = type_for_elt shape (signed_of_elt elt) 2 in
  Arity2 (vtype 0, vtype 1, op2type), elt
(* Genericised constant-shift type-generating function.
   [mkimm] builds the immediate type from the operand width; [?arity] overrides
   the default two-argument arity; [?result] restricts the result element. *)

let const_shift mkimm ?arity ?result shape elt =
  let op2type = (shapemap shape 2) elt in
  let op2width = elt_width op2type in
  let op2 = mkimm op2width
  and op1 = type_for_elt shape elt 1
  and r_elt =
    match result with
      None -> elt
    | Some restriction -> restriction elt in
  let rtype = type_for_elt shape r_elt 0 in
  match arity with
    None -> Arity2 (rtype, op1, op2), elt
  | Some mkarity -> mkarity rtype op1 op2, elt
(* Use for immediate right-shifts. *)

let shift_right shape elt =
  const_shift (fun imm -> T_immediate (1, imm)) shape elt

let shift_right_acc shape elt =
  const_shift (fun imm -> T_immediate (1, imm))
    ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt

(* Use for immediate right-shifts when the operation doesn't care about
   signedness. *)

let shift_right_sign_invar =
  make_sign_invariant shift_right

(* Immediate right-shift; result is unsigned even when operand is signed. *)

let shift_right_to_uns shape elt =
  const_shift (fun imm -> T_immediate (1, imm)) ~result:unsigned_of_elt
    shape elt

(* Immediate left-shift. *)

let shift_left shape elt =
  const_shift (fun imm -> T_immediate (0, imm - 1)) shape elt

(* Immediate left-shift, unsigned result. *)

let shift_left_to_uns shape elt =
  const_shift (fun imm -> T_immediate (0, imm - 1)) ~result:unsigned_of_elt
    shape elt

(* Immediate left-shift, don't care about signs. *)

let shift_left_sign_invar =
  make_sign_invariant shift_left

(* Shift left/right and insert: only element size matters. *)

let shift_insert shape elt =
  let arity, elt =
    const_shift (fun imm -> T_immediate (1, imm))
      ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt in
  arity, bits_of_elt elt
(* Get/set lane. *)

let get_lane shape elt =
  let vtype = type_for_elt shape elt in
  Arity2 (vtype 0, vtype 1, vtype 2),
    (* Poly lanes are extracted as plain unsigned values. *)
    (match elt with P8 -> U8 | P16 -> U16 | x -> x)

let set_lane shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt

let set_lane_notype shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), NoElts

(* Build a vector from a 64-bit bit pattern (always passed as uint64). *)
let create_vector shape elt =
  let vtype = type_for_elt shape U64 1
  and rtype = type_for_elt shape elt 0 in
  Arity1 (rtype, vtype), elt

(* Conversion/cast between element types; [elt] must be Conv or Cast. *)
let conv make_arity shape elt =
  let edest, esrc = match elt with
    Conv (edest, esrc) | Cast (edest, esrc) -> edest, esrc
  | _ -> failwith "Non-conversion element in conversion" in
  let vtype = type_for_elt shape esrc
  and rtype = type_for_elt shape edest 0 in
  make_arity rtype vtype, elt

let conv_1 = conv (fun rtype vtype -> Arity1 (rtype, vtype 1))
let conv_2 = conv (fun rtype vtype -> Arity2 (rtype, vtype 1, vtype 2))
(* Operation has an unsigned result even if operands are signed. *)

let dst_unsign make_arity shape elt =
  let vtype = type_for_elt shape elt
  and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
  make_arity rtype vtype, elt

let dst_unsign_1 = dst_unsign (fun rtype vtype -> Arity1 (rtype, vtype 1))

(* Wrap a typing function so that its element type collapses to Bits. *)
let make_bits_only func shape elt =
  let arity, elt' = func shape elt in
  arity, bits_of_elt elt'

(* Extend operation. *)

let extend shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt

(* Table look-up operations. Operand 2 is signed/unsigned for signed/unsigned
   integer ops respectively, or unsigned for polynomial ops. *)

let table mkarity shape elt =
  let vtype = type_for_elt shape elt in
  let op2 = type_for_elt shape (poly_unsigned_variant elt) 2 in
  mkarity vtype op2, bits_of_elt elt

let table_2 = table (fun vtype op2 -> Arity2 (vtype 0, vtype 1, op2))
let table_io = table (fun vtype op2 -> Arity3 (vtype 0, vtype 0, vtype 1, op2))

(* Operations where only bits matter. *)

let bits_1 = make_bits_only elts_same_1
let bits_2 = make_bits_only elts_same_2
let bits_3 = make_bits_only elts_same_3
(* Store insns: void result, element type collapsed to Bits. *)
let store_1 shape elt =
  let vtype = type_for_elt shape elt in
  Arity2 (T_void, vtype 0, vtype 1), bits_of_elt elt

let store_3 shape elt =
  let vtype = type_for_elt shape elt in
  Arity3 (T_void, vtype 0, vtype 1, vtype 2), bits_of_elt elt

(* Wrap a typing function, discarding the element type entirely. *)
let make_notype func shape elt =
  let arity, _ = func shape elt in
  arity, NoElts

let notype_1 = make_notype elts_same_1
let notype_2 = make_notype elts_same_2
let notype_3 = make_notype elts_same_3

(* Bit-select operations (first operand is unsigned int). *)

let bit_select shape elt =
  let vtype = type_for_elt shape elt
  and itype = type_for_elt shape (unsigned_of_elt elt) in
  Arity3 (vtype 0, itype 1, vtype 2, vtype 3), NoElts

(* Common lists of supported element types. *)

let su_8_32 = [S8; S16; S32; U8; U16; U32]
let su_8_64 = S64 :: U64 :: su_8_32
let su_16_64 = [S16; S32; S64; U16; U32; U64]
let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32
let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64
770 let ops =
772 (* Addition. *)
773 Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_64;
774 Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64;
775 Vadd, [], Long, "vaddl", elts_same_2, su_8_32;
776 Vadd, [], Wide, "vaddw", elts_same_2, su_8_32;
777 Vadd, [Halving], All (3, Dreg), "vhadd", elts_same_2, su_8_32;
778 Vadd, [Halving], All (3, Qreg), "vhaddQ", elts_same_2, su_8_32;
779 Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
780 All (3, Dreg), "vRhadd", elts_same_2, su_8_32;
781 Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
782 All (3, Qreg), "vRhaddQ", elts_same_2, su_8_32;
783 Vadd, [Saturating], All (3, Dreg), "vqadd", elts_same_2, su_8_64;
784 Vadd, [Saturating], All (3, Qreg), "vqaddQ", elts_same_2, su_8_64;
785 Vadd, [High_half], Narrow, "vaddhn", sign_invar_2, su_16_64;
786 Vadd, [Instruction_name ["vraddhn"]; Rounding; High_half],
787 Narrow, "vRaddhn", sign_invar_2, su_16_64;
789 (* Multiplication. *)
790 Vmul, [], All (3, Dreg), "vmul", sign_invar_2, P8 :: F32 :: su_8_32;
791 Vmul, [], All (3, Qreg), "vmulQ", sign_invar_2, P8 :: F32 :: su_8_32;
792 Vmul, [Saturating; Doubling; High_half], All (3, Dreg), "vqdmulh",
793 elts_same_2, [S16; S32];
794 Vmul, [Saturating; Doubling; High_half], All (3, Qreg), "vqdmulhQ",
795 elts_same_2, [S16; S32];
796 Vmul,
797 [Saturating; Rounding; Doubling; High_half;
798 Instruction_name ["vqrdmulh"]],
799 All (3, Dreg), "vqRdmulh",
800 elts_same_2, [S16; S32];
801 Vmul,
802 [Saturating; Rounding; Doubling; High_half;
803 Instruction_name ["vqrdmulh"]],
804 All (3, Qreg), "vqRdmulhQ",
805 elts_same_2, [S16; S32];
806 Vmul, [], Long, "vmull", elts_same_2, P8 :: su_8_32;
807 Vmul, [Saturating; Doubling], Long, "vqdmull", elts_same_2, [S16; S32];
809 (* Multiply-accumulate. *)
810 Vmla, [], All (3, Dreg), "vmla", sign_invar_io, F32 :: su_8_32;
811 Vmla, [], All (3, Qreg), "vmlaQ", sign_invar_io, F32 :: su_8_32;
812 Vmla, [], Long, "vmlal", elts_same_io, su_8_32;
813 Vmla, [Saturating; Doubling], Long, "vqdmlal", elts_same_io, [S16; S32];
815 (* Multiply-subtract. *)
816 Vmls, [], All (3, Dreg), "vmls", sign_invar_io, F32 :: su_8_32;
817 Vmls, [], All (3, Qreg), "vmlsQ", sign_invar_io, F32 :: su_8_32;
818 Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
819 Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
821 (* Subtraction. *)
822 Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_64;
823 Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64;
824 Vsub, [], Long, "vsubl", elts_same_2, su_8_32;
825 Vsub, [], Wide, "vsubw", elts_same_2, su_8_32;
826 Vsub, [Halving], All (3, Dreg), "vhsub", elts_same_2, su_8_32;
827 Vsub, [Halving], All (3, Qreg), "vhsubQ", elts_same_2, su_8_32;
828 Vsub, [Saturating], All (3, Dreg), "vqsub", elts_same_2, su_8_64;
829 Vsub, [Saturating], All (3, Qreg), "vqsubQ", elts_same_2, su_8_64;
830 Vsub, [High_half], Narrow, "vsubhn", sign_invar_2, su_16_64;
831 Vsub, [Instruction_name ["vrsubhn"]; Rounding; High_half],
832 Narrow, "vRsubhn", sign_invar_2, su_16_64;
834 (* Comparison, equal. *)
835 Vceq, [], All (3, Dreg), "vceq", cmp_sign_invar, P8 :: F32 :: su_8_32;
836 Vceq, [], All (3, Qreg), "vceqQ", cmp_sign_invar, P8 :: F32 :: su_8_32;
838 (* Comparison, greater-than or equal. *)
839 Vcge, [], All (3, Dreg), "vcge", cmp_sign_matters, F32 :: su_8_32;
840 Vcge, [], All (3, Qreg), "vcgeQ", cmp_sign_matters, F32 :: su_8_32;
842 (* Comparison, less-than or equal. *)
843 Vcle, [Flipped "vcge"], All (3, Dreg), "vcle", cmp_sign_matters,
844 F32 :: su_8_32;
845 Vcle, [Instruction_name ["vcge"]; Flipped "vcgeQ"],
846 All (3, Qreg), "vcleQ", cmp_sign_matters,
847 F32 :: su_8_32;
849 (* Comparison, greater-than. *)
850 Vcgt, [], All (3, Dreg), "vcgt", cmp_sign_matters, F32 :: su_8_32;
851 Vcgt, [], All (3, Qreg), "vcgtQ", cmp_sign_matters, F32 :: su_8_32;
853 (* Comparison, less-than. *)
854 Vclt, [Flipped "vcgt"], All (3, Dreg), "vclt", cmp_sign_matters,
855 F32 :: su_8_32;
856 Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtQ"],
857 All (3, Qreg), "vcltQ", cmp_sign_matters,
858 F32 :: su_8_32;
860 (* Compare absolute greater-than or equal. *)
861 Vcage, [Instruction_name ["vacge"]],
862 All (3, Dreg), "vcage", cmp_sign_matters, [F32];
863 Vcage, [Instruction_name ["vacge"]],
864 All (3, Qreg), "vcageQ", cmp_sign_matters, [F32];
866 (* Compare absolute less-than or equal. *)
867 Vcale, [Instruction_name ["vacge"]; Flipped "vcage"],
868 All (3, Dreg), "vcale", cmp_sign_matters, [F32];
869 Vcale, [Instruction_name ["vacge"]; Flipped "vcageQ"],
870 All (3, Qreg), "vcaleQ", cmp_sign_matters, [F32];
872 (* Compare absolute greater-than or equal. *)
873 Vcagt, [Instruction_name ["vacgt"]],
874 All (3, Dreg), "vcagt", cmp_sign_matters, [F32];
875 Vcagt, [Instruction_name ["vacgt"]],
876 All (3, Qreg), "vcagtQ", cmp_sign_matters, [F32];
878 (* Compare absolute less-than or equal. *)
879 Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagt"],
880 All (3, Dreg), "vcalt", cmp_sign_matters, [F32];
881 Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagtQ"],
882 All (3, Qreg), "vcaltQ", cmp_sign_matters, [F32];
884 (* Test bits. *)
885 Vtst, [], All (3, Dreg), "vtst", cmp_bits, P8 :: su_8_32;
886 Vtst, [], All (3, Qreg), "vtstQ", cmp_bits, P8 :: su_8_32;
888 (* Absolute difference. *)
889 Vabd, [], All (3, Dreg), "vabd", elts_same_2, F32 :: su_8_32;
890 Vabd, [], All (3, Qreg), "vabdQ", elts_same_2, F32 :: su_8_32;
891 Vabd, [], Long, "vabdl", elts_same_2, su_8_32;
893 (* Absolute difference and accumulate. *)
894 Vaba, [], All (3, Dreg), "vaba", elts_same_io, su_8_32;
895 Vaba, [], All (3, Qreg), "vabaQ", elts_same_io, su_8_32;
896 Vaba, [], Long, "vabal", elts_same_io, su_8_32;
898 (* Max. *)
899 Vmax, [], All (3, Dreg), "vmax", elts_same_2, F32 :: su_8_32;
900 Vmax, [], All (3, Qreg), "vmaxQ", elts_same_2, F32 :: su_8_32;
902 (* Min. *)
903 Vmin, [], All (3, Dreg), "vmin", elts_same_2, F32 :: su_8_32;
904 Vmin, [], All (3, Qreg), "vminQ", elts_same_2, F32 :: su_8_32;
906 (* Pairwise add. *)
907 Vpadd, [], All (3, Dreg), "vpadd", sign_invar_2, F32 :: su_8_32;
908 Vpadd, [], Long_noreg Dreg, "vpaddl", elts_same_1, su_8_32;
909 Vpadd, [], Long_noreg Qreg, "vpaddlQ", elts_same_1, su_8_32;
911 (* Pairwise add, widen and accumulate. *)
912 Vpada, [], Wide_noreg Dreg, "vpadal", elts_same_2, su_8_32;
913 Vpada, [], Wide_noreg Qreg, "vpadalQ", elts_same_2, su_8_32;
915 (* Folding maximum, minimum. *)
916 Vpmax, [], All (3, Dreg), "vpmax", elts_same_2, F32 :: su_8_32;
917 Vpmin, [], All (3, Dreg), "vpmin", elts_same_2, F32 :: su_8_32;
919 (* Reciprocal step. *)
920 Vrecps, [], All (3, Dreg), "vrecps", elts_same_2, [F32];
921 Vrecps, [], All (3, Qreg), "vrecpsQ", elts_same_2, [F32];
922 Vrsqrts, [], All (3, Dreg), "vrsqrts", elts_same_2, [F32];
923 Vrsqrts, [], All (3, Qreg), "vrsqrtsQ", elts_same_2, [F32];
925 (* Vector shift left. *)
926 Vshl, [], All (3, Dreg), "vshl", reg_shift, su_8_64;
927 Vshl, [], All (3, Qreg), "vshlQ", reg_shift, su_8_64;
928 Vshl, [Instruction_name ["vrshl"]; Rounding],
929 All (3, Dreg), "vRshl", reg_shift, su_8_64;
930 Vshl, [Instruction_name ["vrshl"]; Rounding],
931 All (3, Qreg), "vRshlQ", reg_shift, su_8_64;
932 Vshl, [Saturating], All (3, Dreg), "vqshl", reg_shift, su_8_64;
933 Vshl, [Saturating], All (3, Qreg), "vqshlQ", reg_shift, su_8_64;
934 Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
935 All (3, Dreg), "vqRshl", reg_shift, su_8_64;
936 Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
937 All (3, Qreg), "vqRshlQ", reg_shift, su_8_64;
939 (* Vector shift right by constant. *)
940 Vshr_n, [], Binary_imm Dreg, "vshr_n", shift_right, su_8_64;
941 Vshr_n, [], Binary_imm Qreg, "vshrQ_n", shift_right, su_8_64;
942 Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Dreg,
943 "vRshr_n", shift_right, su_8_64;
944 Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Qreg,
945 "vRshrQ_n", shift_right, su_8_64;
946 Vshr_n, [], Narrow_imm, "vshrn_n", shift_right_sign_invar, su_16_64;
947 Vshr_n, [Instruction_name ["vrshrn"]; Rounding], Narrow_imm, "vRshrn_n",
948 shift_right_sign_invar, su_16_64;
949 Vshr_n, [Saturating], Narrow_imm, "vqshrn_n", shift_right, su_16_64;
950 Vshr_n, [Instruction_name ["vqrshrn"]; Saturating; Rounding], Narrow_imm,
951 "vqRshrn_n", shift_right, su_16_64;
952 Vshr_n, [Saturating; Dst_unsign], Narrow_imm, "vqshrun_n",
953 shift_right_to_uns, [S16; S32; S64];
954 Vshr_n, [Instruction_name ["vqrshrun"]; Saturating; Dst_unsign; Rounding],
955 Narrow_imm, "vqRshrun_n", shift_right_to_uns, [S16; S32; S64];
957 (* Vector shift left by constant. *)
958 Vshl_n, [], Binary_imm Dreg, "vshl_n", shift_left_sign_invar, su_8_64;
959 Vshl_n, [], Binary_imm Qreg, "vshlQ_n", shift_left_sign_invar, su_8_64;
960 Vshl_n, [Saturating], Binary_imm Dreg, "vqshl_n", shift_left, su_8_64;
961 Vshl_n, [Saturating], Binary_imm Qreg, "vqshlQ_n", shift_left, su_8_64;
962 Vshl_n, [Saturating; Dst_unsign], Binary_imm Dreg, "vqshlu_n",
963 shift_left_to_uns, [S8; S16; S32; S64];
964 Vshl_n, [Saturating; Dst_unsign], Binary_imm Qreg, "vqshluQ_n",
965 shift_left_to_uns, [S8; S16; S32; S64];
966 Vshl_n, [], Long_imm, "vshll_n", shift_left, su_8_32;
968 (* Vector shift right by constant and accumulate. *)
969 Vsra_n, [], Binary_imm Dreg, "vsra_n", shift_right_acc, su_8_64;
970 Vsra_n, [], Binary_imm Qreg, "vsraQ_n", shift_right_acc, su_8_64;
971 Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Dreg,
972 "vRsra_n", shift_right_acc, su_8_64;
973 Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Qreg,
974 "vRsraQ_n", shift_right_acc, su_8_64;
976 (* Vector shift right and insert. *)
977 Vsri, [], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert,
978 P8 :: P16 :: su_8_64;
979 Vsri, [], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert,
980 P8 :: P16 :: su_8_64;
982 (* Vector shift left and insert. *)
983 Vsli, [], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert,
984 P8 :: P16 :: su_8_64;
985 Vsli, [], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert,
986 P8 :: P16 :: su_8_64;
988 (* Absolute value. *)
989 Vabs, [], All (2, Dreg), "vabs", elts_same_1, [S8; S16; S32; F32];
990 Vabs, [], All (2, Qreg), "vabsQ", elts_same_1, [S8; S16; S32; F32];
991 Vabs, [Saturating], All (2, Dreg), "vqabs", elts_same_1, [S8; S16; S32];
992 Vabs, [Saturating], All (2, Qreg), "vqabsQ", elts_same_1, [S8; S16; S32];
994 (* Negate. *)
995 Vneg, [], All (2, Dreg), "vneg", elts_same_1, [S8; S16; S32; F32];
996 Vneg, [], All (2, Qreg), "vnegQ", elts_same_1, [S8; S16; S32; F32];
997 Vneg, [Saturating], All (2, Dreg), "vqneg", elts_same_1, [S8; S16; S32];
998 Vneg, [Saturating], All (2, Qreg), "vqnegQ", elts_same_1, [S8; S16; S32];
1000 (* Bitwise not. *)
1001 Vmvn, [], All (2, Dreg), "vmvn", notype_1, P8 :: su_8_32;
1002 Vmvn, [], All (2, Qreg), "vmvnQ", notype_1, P8 :: su_8_32;
1004 (* Count leading sign bits. *)
1005 Vcls, [], All (2, Dreg), "vcls", elts_same_1, [S8; S16; S32];
1006 Vcls, [], All (2, Qreg), "vclsQ", elts_same_1, [S8; S16; S32];
1008 (* Count leading zeros. *)
1009 Vclz, [], All (2, Dreg), "vclz", sign_invar_1, su_8_32;
1010 Vclz, [], All (2, Qreg), "vclzQ", sign_invar_1, su_8_32;
1012 (* Count number of set bits. *)
1013 Vcnt, [], All (2, Dreg), "vcnt", bits_1, [P8; S8; U8];
1014 Vcnt, [], All (2, Qreg), "vcntQ", bits_1, [P8; S8; U8];
1016 (* Reciprocal estimate. *)
1017 Vrecpe, [], All (2, Dreg), "vrecpe", elts_same_1, [U32; F32];
1018 Vrecpe, [], All (2, Qreg), "vrecpeQ", elts_same_1, [U32; F32];
1020 (* Reciprocal square-root estimate. *)
1021 Vrsqrte, [], All (2, Dreg), "vrsqrte", elts_same_1, [U32; F32];
1022 Vrsqrte, [], All (2, Qreg), "vrsqrteQ", elts_same_1, [U32; F32];
1024 (* Get lanes from a vector. *)
1025 Vget_lane,
1026 [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
1027 Instruction_name ["vmov"]],
1028 Use_operands [| Corereg; Dreg; Immed |],
1029 "vget_lane", get_lane, pf_su_8_32;
1030 Vget_lane,
1031 [InfoWord;
1032 Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
1033 Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
1034 Use_operands [| Corereg; Dreg; Immed |],
1035 "vget_lane", notype_2, [S64; U64];
1036 Vget_lane,
1037 [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
1038 Instruction_name ["vmov"]],
1039 Use_operands [| Corereg; Qreg; Immed |],
1040 "vgetQ_lane", get_lane, pf_su_8_32;
1041 Vget_lane,
1042 [InfoWord;
1043 Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
1044 Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
1045 Use_operands [| Corereg; Qreg; Immed |],
1046 "vgetQ_lane", notype_2, [S64; U64];
1048 (* Set lanes in a vector. *)
1049 Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
1050 Instruction_name ["vmov"]],
1051 Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
1052 set_lane, pf_su_8_32;
1053 Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
1054 Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
1055 Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
1056 set_lane_notype, [S64; U64];
1057 Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
1058 Instruction_name ["vmov"]],
1059 Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
1060 set_lane, pf_su_8_32;
1061 Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
1062 Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
1063 Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
1064 set_lane_notype, [S64; U64];
1066 (* Create vector from literal bit pattern. *)
1067 Vcreate,
1068 [No_op], (* Not really, but it can yield various things that are too
1069 hard for the test generator at this time. *)
1070 Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
1071 pf_su_8_64;
1073 (* Set all lanes to the same value. *)
1074 Vdup_n, [],
1075 Use_operands [| Dreg; Corereg |], "vdup_n", bits_1,
1076 pf_su_8_32;
1077 Vdup_n,
1078 [Instruction_name ["vmov"];
1079 Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
1080 Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
1081 [S64; U64];
1082 Vdup_n, [],
1083 Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
1084 pf_su_8_32;
1085 Vdup_n,
1086 [Instruction_name ["vmov"];
1087 Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
1088 Use_operands [| Dreg; Corereg; Corereg |]]],
1089 Use_operands [| Qreg; Corereg |], "vdupQ_n", notype_1,
1090 [S64; U64];
1092 (* These are just aliases for the above. *)
1093 Vmov_n,
1094 [Builtin_name "vdup_n"],
1095 Use_operands [| Dreg; Corereg |],
1096 "vmov_n", bits_1, pf_su_8_32;
1097 Vmov_n,
1098 [Builtin_name "vdup_n";
1099 Instruction_name ["vmov"];
1100 Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
1101 Use_operands [| Dreg; Corereg |],
1102 "vmov_n", notype_1, [S64; U64];
1103 Vmov_n,
1104 [Builtin_name "vdupQ_n"],
1105 Use_operands [| Qreg; Corereg |],
1106 "vmovQ_n", bits_1, pf_su_8_32;
1107 Vmov_n,
1108 [Builtin_name "vdupQ_n";
1109 Instruction_name ["vmov"];
1110 Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
1111 Use_operands [| Dreg; Corereg; Corereg |]]],
1112 Use_operands [| Qreg; Corereg |],
1113 "vmovQ_n", notype_1, [S64; U64];
1115 (* Duplicate, lane version. We can't use Use_operands here because the
1116 rightmost register (always Dreg) would be picked up by find_key_operand,
1117 when we want the leftmost register to be used in this case (otherwise
1118 the modes are indistinguishable in neon.md, etc. *)
1119 Vdup_lane,
1120 [Disassembles_as [Use_operands [| Dreg; Element_of_dreg |]]],
1121 Unary_scalar Dreg, "vdup_lane", bits_2, pf_su_8_32;
1122 Vdup_lane,
1123 [No_op; Const_valuator (fun _ -> 0)],
1124 Unary_scalar Dreg, "vdup_lane", bits_2, [S64; U64];
1125 Vdup_lane,
1126 [Disassembles_as [Use_operands [| Qreg; Element_of_dreg |]]],
1127 Unary_scalar Qreg, "vdupQ_lane", bits_2, pf_su_8_32;
1128 Vdup_lane,
1129 [No_op; Const_valuator (fun _ -> 0)],
1130 Unary_scalar Qreg, "vdupQ_lane", bits_2, [S64; U64];
1132 (* Combining vectors. *)
1133 Vcombine, [No_op],
1134 Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
1135 pf_su_8_64;
1137 (* Splitting vectors. *)
1138 Vget_high, [No_op],
1139 Use_operands [| Dreg; Qreg |], "vget_high",
1140 notype_1, pf_su_8_64;
1141 Vget_low, [Instruction_name ["vmov"];
1142 Disassembles_as [Use_operands [| Dreg; Dreg |]]],
1143 Use_operands [| Dreg; Qreg |], "vget_low",
1144 notype_1, pf_su_8_64;
1146 (* Conversions. *)
1147 Vcvt, [InfoWord], All (2, Dreg), "vcvt", conv_1,
1148 [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
1149 Vcvt, [InfoWord], All (2, Qreg), "vcvtQ", conv_1,
1150 [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
1151 Vcvt_n, [InfoWord], Use_operands [| Dreg; Dreg; Immed |], "vcvt_n", conv_2,
1152 [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
1153 Vcvt_n, [InfoWord], Use_operands [| Qreg; Qreg; Immed |], "vcvtQ_n", conv_2,
1154 [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
1156 (* Move, narrowing. *)
1157 Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]],
1158 Narrow, "vmovn", sign_invar_1, su_16_64;
1159 Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating],
1160 Narrow, "vqmovn", elts_same_1, su_16_64;
1161 Vmovn,
1162 [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating; Dst_unsign],
1163 Narrow, "vqmovun", dst_unsign_1,
1164 [S16; S32; S64];
1166 (* Move, long. *)
1167 Vmovl, [Disassembles_as [Use_operands [| Qreg; Dreg |]]],
1168 Long, "vmovl", elts_same_1, su_8_32;
1170 (* Table lookup. *)
1171 Vtbl 1,
1172 [Instruction_name ["vtbl"];
1173 Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
1174 Use_operands [| Dreg; Dreg; Dreg |], "vtbl1", table_2, [U8; S8; P8];
1175 Vtbl 2, [Instruction_name ["vtbl"]],
1176 Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbl2", table_2,
1177 [U8; S8; P8];
1178 Vtbl 3, [Instruction_name ["vtbl"]],
1179 Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbl3", table_2,
1180 [U8; S8; P8];
1181 Vtbl 4, [Instruction_name ["vtbl"]],
1182 Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbl4", table_2,
1183 [U8; S8; P8];
1185 (* Extended table lookup. *)
1186 Vtbx 1,
1187 [Instruction_name ["vtbx"];
1188 Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
1189 Use_operands [| Dreg; Dreg; Dreg |], "vtbx1", table_io, [U8; S8; P8];
1190 Vtbx 2, [Instruction_name ["vtbx"]],
1191 Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbx2", table_io,
1192 [U8; S8; P8];
1193 Vtbx 3, [Instruction_name ["vtbx"]],
1194 Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbx3", table_io,
1195 [U8; S8; P8];
1196 Vtbx 4, [Instruction_name ["vtbx"]],
1197 Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbx4", table_io,
1198 [U8; S8; P8];
1200 (* Multiply, lane. (note: these were undocumented at the time of
1201 writing). *)
1202 Vmul_lane, [], By_scalar Dreg, "vmul_lane", sign_invar_2_lane,
1203 [S16; S32; U16; U32; F32];
1204 Vmul_lane, [], By_scalar Qreg, "vmulQ_lane", sign_invar_2_lane,
1205 [S16; S32; U16; U32; F32];
1207 (* Multiply-accumulate, lane. *)
1208 Vmla_lane, [], By_scalar Dreg, "vmla_lane", sign_invar_io_lane,
1209 [S16; S32; U16; U32; F32];
1210 Vmla_lane, [], By_scalar Qreg, "vmlaQ_lane", sign_invar_io_lane,
1211 [S16; S32; U16; U32; F32];
1212 Vmla_lane, [], Wide_lane, "vmlal_lane", elts_same_io_lane,
1213 [S16; S32; U16; U32];
1214 Vmla_lane, [Saturating; Doubling], Wide_lane, "vqdmlal_lane",
1215 elts_same_io_lane, [S16; S32];
1217 (* Multiply-subtract, lane. *)
1218 Vmls_lane, [], By_scalar Dreg, "vmls_lane", sign_invar_io_lane,
1219 [S16; S32; U16; U32; F32];
1220 Vmls_lane, [], By_scalar Qreg, "vmlsQ_lane", sign_invar_io_lane,
1221 [S16; S32; U16; U32; F32];
1222 Vmls_lane, [], Wide_lane, "vmlsl_lane", elts_same_io_lane,
1223 [S16; S32; U16; U32];
1224 Vmls_lane, [Saturating; Doubling], Wide_lane, "vqdmlsl_lane",
1225 elts_same_io_lane, [S16; S32];
1227 (* Long multiply, lane. *)
1228 Vmull_lane, [],
1229 Wide_lane, "vmull_lane", elts_same_2_lane, [S16; S32; U16; U32];
1231 (* Saturating doubling long multiply, lane. *)
1232 Vqdmull_lane, [Saturating; Doubling],
1233 Wide_lane, "vqdmull_lane", elts_same_2_lane, [S16; S32];
1235 (* Saturating doubling long multiply high, lane. *)
1236 Vqdmulh_lane, [Saturating; Halving],
1237 By_scalar Qreg, "vqdmulhQ_lane", elts_same_2_lane, [S16; S32];
1238 Vqdmulh_lane, [Saturating; Halving],
1239 By_scalar Dreg, "vqdmulh_lane", elts_same_2_lane, [S16; S32];
1240 Vqdmulh_lane, [Saturating; Halving; Rounding;
1241 Instruction_name ["vqrdmulh"]],
1242 By_scalar Qreg, "vqRdmulhQ_lane", elts_same_2_lane, [S16; S32];
1243 Vqdmulh_lane, [Saturating; Halving; Rounding;
1244 Instruction_name ["vqrdmulh"]],
1245 By_scalar Dreg, "vqRdmulh_lane", elts_same_2_lane, [S16; S32];
1247 (* Vector multiply by scalar. *)
1248 Vmul_n, [InfoWord;
1249 Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
1250 Use_operands [| Dreg; Dreg; Corereg |], "vmul_n",
1251 sign_invar_2, [S16; S32; U16; U32; F32];
1252 Vmul_n, [InfoWord;
1253 Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
1254 Use_operands [| Qreg; Qreg; Corereg |], "vmulQ_n",
1255 sign_invar_2, [S16; S32; U16; U32; F32];
1257 (* Vector long multiply by scalar. *)
1258 Vmull_n, [Instruction_name ["vmull"];
1259 Disassembles_as [Use_operands [| Qreg; Dreg; Element_of_dreg |]]],
1260 Wide_scalar, "vmull_n",
1261 elts_same_2, [S16; S32; U16; U32];
1263 (* Vector saturating doubling long multiply by scalar. *)
1264 Vqdmull_n, [Saturating; Doubling;
1265 Disassembles_as [Use_operands [| Qreg; Dreg;
1266 Element_of_dreg |]]],
1267 Wide_scalar, "vqdmull_n",
1268 elts_same_2, [S16; S32];
1270 (* Vector saturating doubling long multiply high by scalar. *)
1271 Vqdmulh_n,
1272 [Saturating; Halving; InfoWord;
1273 Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
1274 Use_operands [| Qreg; Qreg; Corereg |],
1275 "vqdmulhQ_n", elts_same_2, [S16; S32];
1276 Vqdmulh_n,
1277 [Saturating; Halving; InfoWord;
1278 Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
1279 Use_operands [| Dreg; Dreg; Corereg |],
1280 "vqdmulh_n", elts_same_2, [S16; S32];
1281 Vqdmulh_n,
1282 [Saturating; Halving; Rounding; InfoWord;
1283 Instruction_name ["vqrdmulh"];
1284 Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
1285 Use_operands [| Qreg; Qreg; Corereg |],
1286 "vqRdmulhQ_n", elts_same_2, [S16; S32];
1287 Vqdmulh_n,
1288 [Saturating; Halving; Rounding; InfoWord;
1289 Instruction_name ["vqrdmulh"];
1290 Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
1291 Use_operands [| Dreg; Dreg; Corereg |],
1292 "vqRdmulh_n", elts_same_2, [S16; S32];
1294 (* Vector multiply-accumulate by scalar. *)
1295 Vmla_n, [InfoWord;
1296 Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
1297 Use_operands [| Dreg; Dreg; Corereg |], "vmla_n",
1298 sign_invar_io, [S16; S32; U16; U32; F32];
1299 Vmla_n, [InfoWord;
1300 Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
1301 Use_operands [| Qreg; Qreg; Corereg |], "vmlaQ_n",
1302 sign_invar_io, [S16; S32; U16; U32; F32];
1303 Vmla_n, [], Wide_scalar, "vmlal_n", elts_same_io, [S16; S32; U16; U32];
1304 Vmla_n, [Saturating; Doubling], Wide_scalar, "vqdmlal_n", elts_same_io,
1305 [S16; S32];
1307 (* Vector multiply subtract by scalar. *)
1308 Vmls_n, [InfoWord;
1309 Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
1310 Use_operands [| Dreg; Dreg; Corereg |], "vmls_n",
1311 sign_invar_io, [S16; S32; U16; U32; F32];
1312 Vmls_n, [InfoWord;
1313 Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
1314 Use_operands [| Qreg; Qreg; Corereg |], "vmlsQ_n",
1315 sign_invar_io, [S16; S32; U16; U32; F32];
1316 Vmls_n, [], Wide_scalar, "vmlsl_n", elts_same_io, [S16; S32; U16; U32];
1317 Vmls_n, [Saturating; Doubling], Wide_scalar, "vqdmlsl_n", elts_same_io,
1318 [S16; S32];
1320 (* Vector extract. *)
1321 Vext, [Const_valuator (fun _ -> 0)],
1322 Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
1323 pf_su_8_64;
1324 Vext, [Const_valuator (fun _ -> 0)],
1325 Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
1326 pf_su_8_64;
1328 (* Reverse elements. *)
1329 Vrev64, [], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32;
1330 Vrev64, [], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
1331 Vrev32, [], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16];
1332 Vrev32, [], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16];
1333 Vrev16, [], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8];
1334 Vrev16, [], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8];
1336 (* Bit selection. *)
1337 Vbsl,
1338 [Instruction_name ["vbsl"; "vbit"; "vbif"];
1339 Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
1340 Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
1341 pf_su_8_64;
1342 Vbsl,
1343 [Instruction_name ["vbsl"; "vbit"; "vbif"];
1344 Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
1345 Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
1346 pf_su_8_64;
1348 (* Transpose elements. **NOTE** ReturnPtr goes some of the way towards
1349 generating good code for intrinsics which return structure types --
1350 builtins work well by themselves (and understand that the values being
1351 stored on e.g. the stack also reside in registers, so can optimise the
1352 stores away entirely if the results are used immediately), but
1353 intrinsics are very much less efficient. Maybe something can be improved
1354 re: inlining, or tweaking the ABI used for intrinsics (a special call
attribute?). *)
1357 (* LLVM LOCAL begin Use return by value instead of ReturnPtr. *)
1358 Vtrn, [], Use_operands [| VecArray (2, Dreg); Dreg; Dreg |],
1359 "vtrn", bits_2, pf_su_8_32;
1360 Vtrn, [], Use_operands [| VecArray (2, Qreg); Qreg; Qreg |],
1361 "vtrnQ", bits_2, pf_su_8_32;
1363 (* Zip elements. *)
1364 Vzip, [], Use_operands [| VecArray (2, Dreg); Dreg; Dreg |],
1365 "vzip", bits_2, pf_su_8_32;
1366 Vzip, [], Use_operands [| VecArray (2, Qreg); Qreg; Qreg |],
1367 "vzipQ", bits_2, pf_su_8_32;
1369 (* Unzip elements. *)
1370 Vuzp, [], Use_operands [| VecArray (2, Dreg); Dreg; Dreg |],
1371 "vuzp", bits_2, pf_su_8_32;
1372 Vuzp, [], Use_operands [| VecArray (2, Qreg); Qreg; Qreg |],
1373 "vuzpQ", bits_2, pf_su_8_32;
1374 (* LLVM LOCAL end Use return by value instead of ReturnPtr. *)
1376 (* Element/structure loads. VLD1 variants. *)
1377 Vldx 1,
1378 [Disassembles_as [Use_operands [| VecArray (1, Dreg);
1379 CstPtrTo Corereg |]]],
1380 Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
1381 pf_su_8_64;
1382 Vldx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
1383 CstPtrTo Corereg |]]],
1384 Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
1385 pf_su_8_64;
1387 Vldx_lane 1,
1388 [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
1389 CstPtrTo Corereg |]]],
1390 Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
1391 "vld1_lane", bits_3, pf_su_8_32;
1392 Vldx_lane 1,
1393 [Disassembles_as [Use_operands [| VecArray (1, Dreg);
1394 CstPtrTo Corereg |]];
1395 Const_valuator (fun _ -> 0)],
1396 Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
1397 "vld1_lane", bits_3, [S64; U64];
1398 Vldx_lane 1,
1399 [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
1400 CstPtrTo Corereg |]]],
1401 Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
1402 "vld1Q_lane", bits_3, pf_su_8_32;
1403 Vldx_lane 1,
1404 [Disassembles_as [Use_operands [| VecArray (1, Dreg);
1405 CstPtrTo Corereg |]]],
1406 Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
1407 "vld1Q_lane", bits_3, [S64; U64];
1409 Vldx_dup 1,
1410 [Disassembles_as [Use_operands [| VecArray (1, All_elements_of_dreg);
1411 CstPtrTo Corereg |]]],
1412 Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
1413 bits_1, pf_su_8_32;
1414 Vldx_dup 1,
1415 [Disassembles_as [Use_operands [| VecArray (1, Dreg);
1416 CstPtrTo Corereg |]]],
1417 Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
1418 bits_1, [S64; U64];
1419 Vldx_dup 1,
1420 [Disassembles_as [Use_operands [| VecArray (2, All_elements_of_dreg);
1421 CstPtrTo Corereg |]]],
1422 Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
1423 bits_1, pf_su_8_32;
1424 Vldx_dup 1,
1425 [Disassembles_as [Use_operands [| VecArray (2, Dreg);
1426 CstPtrTo Corereg |]]],
1427 Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
1428 bits_1, [S64; U64];
1430 (* VST1 variants. *)
1431 Vstx 1, [Disassembles_as [Use_operands [| VecArray (1, Dreg);
1432 PtrTo Corereg |]]],
1433 Use_operands [| PtrTo Corereg; Dreg |], "vst1",
1434 store_1, pf_su_8_64;
1435 Vstx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
1436 PtrTo Corereg |]]],
1437 Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
1438 store_1, pf_su_8_64;
1440 Vstx_lane 1,
1441 [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
1442 CstPtrTo Corereg |]]],
1443 Use_operands [| PtrTo Corereg; Dreg; Immed |],
1444 "vst1_lane", store_3, pf_su_8_32;
1445 Vstx_lane 1,
1446 [Disassembles_as [Use_operands [| VecArray (1, Dreg);
1447 CstPtrTo Corereg |]];
1448 Const_valuator (fun _ -> 0)],
1449 Use_operands [| PtrTo Corereg; Dreg; Immed |],
1450 "vst1_lane", store_3, [U64; S64];
1451 Vstx_lane 1,
1452 [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
1453 CstPtrTo Corereg |]]],
1454 Use_operands [| PtrTo Corereg; Qreg; Immed |],
1455 "vst1Q_lane", store_3, pf_su_8_32;
1456 Vstx_lane 1,
1457 [Disassembles_as [Use_operands [| VecArray (1, Dreg);
1458 CstPtrTo Corereg |]]],
1459 Use_operands [| PtrTo Corereg; Qreg; Immed |],
1460 "vst1Q_lane", store_3, [U64; S64];
1462 (* VLD2 variants. *)
1463 Vldx 2, [], Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
1464 "vld2", bits_1, pf_su_8_32;
1465 Vldx 2, [Instruction_name ["vld1"]],
1466 Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
1467 "vld2", bits_1, [S64; U64];
1468 Vldx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
1469 CstPtrTo Corereg |];
1470 Use_operands [| VecArray (2, Dreg);
1471 CstPtrTo Corereg |]]],
1472 Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg |],
1473 "vld2Q", bits_1, pf_su_8_32;
1475 Vldx_lane 2,
1476 [Disassembles_as [Use_operands
1477 [| VecArray (2, Element_of_dreg);
1478 CstPtrTo Corereg |]]],
1479 Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg;
1480 VecArray (2, Dreg); Immed |],
1481 "vld2_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
1482 Vldx_lane 2,
1483 [Disassembles_as [Use_operands
1484 [| VecArray (2, Element_of_dreg);
1485 CstPtrTo Corereg |]]],
1486 Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg;
1487 VecArray (2, Qreg); Immed |],
1488 "vld2Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
1490 Vldx_dup 2,
1491 [Disassembles_as [Use_operands
1492 [| VecArray (2, All_elements_of_dreg); CstPtrTo Corereg |]]],
1493 Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
1494 "vld2_dup", bits_1, pf_su_8_32;
1495 Vldx_dup 2,
1496 [Instruction_name ["vld1"]; Disassembles_as [Use_operands
1497 [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
1498 Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
1499 "vld2_dup", bits_1, [S64; U64];
1501 (* VST2 variants. *)
1502 Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
1503 PtrTo Corereg |]]],
1504 Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
1505 store_1, pf_su_8_32;
1506 Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
1507 PtrTo Corereg |]];
1508 Instruction_name ["vst1"]],
1509 Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
1510 store_1, [S64; U64];
1511 Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
1512 PtrTo Corereg |];
1513 Use_operands [| VecArray (2, Dreg);
1514 PtrTo Corereg |]]],
1515 Use_operands [| PtrTo Corereg; VecArray (2, Qreg) |], "vst2Q",
1516 store_1, pf_su_8_32;
1518 Vstx_lane 2,
1519 [Disassembles_as [Use_operands
1520 [| VecArray (2, Element_of_dreg);
1521 CstPtrTo Corereg |]]],
1522 Use_operands [| PtrTo Corereg; VecArray (2, Dreg); Immed |], "vst2_lane",
1523 store_3, P8 :: P16 :: F32 :: su_8_32;
1524 Vstx_lane 2,
1525 [Disassembles_as [Use_operands
1526 [| VecArray (2, Element_of_dreg);
1527 CstPtrTo Corereg |]]],
1528 Use_operands [| PtrTo Corereg; VecArray (2, Qreg); Immed |], "vst2Q_lane",
1529 store_3, [P16; F32; U16; U32; S16; S32];
1531 (* VLD3 variants. *)
1532 Vldx 3, [], Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
1533 "vld3", bits_1, pf_su_8_32;
1534 Vldx 3, [Instruction_name ["vld1"]],
1535 Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
1536 "vld3", bits_1, [S64; U64];
1537 Vldx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
1538 CstPtrTo Corereg |];
1539 Use_operands [| VecArray (3, Dreg);
1540 CstPtrTo Corereg |]]],
1541 Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg |],
1542 "vld3Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
1544 Vldx_lane 3,
1545 [Disassembles_as [Use_operands
1546 [| VecArray (3, Element_of_dreg);
1547 CstPtrTo Corereg |]]],
1548 Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg;
1549 VecArray (3, Dreg); Immed |],
1550 "vld3_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
1551 Vldx_lane 3,
1552 [Disassembles_as [Use_operands
1553 [| VecArray (3, Element_of_dreg);
1554 CstPtrTo Corereg |]]],
1555 Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg;
1556 VecArray (3, Qreg); Immed |],
1557 "vld3Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
1559 Vldx_dup 3,
1560 [Disassembles_as [Use_operands
1561 [| VecArray (3, All_elements_of_dreg); CstPtrTo Corereg |]]],
1562 Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
1563 "vld3_dup", bits_1, pf_su_8_32;
1564 Vldx_dup 3,
1565 [Instruction_name ["vld1"]; Disassembles_as [Use_operands
1566 [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
1567 Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
1568 "vld3_dup", bits_1, [S64; U64];
1570 (* VST3 variants. *)
1571 Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
1572 PtrTo Corereg |]]],
1573 Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
1574 store_1, pf_su_8_32;
1575 Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
1576 PtrTo Corereg |]];
1577 Instruction_name ["vst1"]],
1578 Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
1579 store_1, [S64; U64];
1580 Vstx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
1581 PtrTo Corereg |];
1582 Use_operands [| VecArray (3, Dreg);
1583 PtrTo Corereg |]]],
1584 Use_operands [| PtrTo Corereg; VecArray (3, Qreg) |], "vst3Q",
1585 store_1, pf_su_8_32;
1587 Vstx_lane 3,
1588 [Disassembles_as [Use_operands
1589 [| VecArray (3, Element_of_dreg);
1590 CstPtrTo Corereg |]]],
1591 Use_operands [| PtrTo Corereg; VecArray (3, Dreg); Immed |], "vst3_lane",
1592 store_3, P8 :: P16 :: F32 :: su_8_32;
1593 Vstx_lane 3,
1594 [Disassembles_as [Use_operands
1595 [| VecArray (3, Element_of_dreg);
1596 CstPtrTo Corereg |]]],
1597 Use_operands [| PtrTo Corereg; VecArray (3, Qreg); Immed |], "vst3Q_lane",
1598 store_3, [P16; F32; U16; U32; S16; S32];
1600 (* VLD4/VST4 variants. *)
1601 Vldx 4, [], Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
1602 "vld4", bits_1, pf_su_8_32;
1603 Vldx 4, [Instruction_name ["vld1"]],
1604 Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
1605 "vld4", bits_1, [S64; U64];
1606 Vldx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
1607 CstPtrTo Corereg |];
1608 Use_operands [| VecArray (4, Dreg);
1609 CstPtrTo Corereg |]]],
1610 Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg |],
1611 "vld4Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
1613 Vldx_lane 4,
1614 [Disassembles_as [Use_operands
1615 [| VecArray (4, Element_of_dreg);
1616 CstPtrTo Corereg |]]],
1617 Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg;
1618 VecArray (4, Dreg); Immed |],
1619 "vld4_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
1620 Vldx_lane 4,
1621 [Disassembles_as [Use_operands
1622 [| VecArray (4, Element_of_dreg);
1623 CstPtrTo Corereg |]]],
1624 Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg;
1625 VecArray (4, Qreg); Immed |],
1626 "vld4Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
1628 Vldx_dup 4,
1629 [Disassembles_as [Use_operands
1630 [| VecArray (4, All_elements_of_dreg); CstPtrTo Corereg |]]],
1631 Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
1632 "vld4_dup", bits_1, pf_su_8_32;
1633 Vldx_dup 4,
1634 [Instruction_name ["vld1"]; Disassembles_as [Use_operands
1635 [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
1636 Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
1637 "vld4_dup", bits_1, [S64; U64];
1639 Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
1640 PtrTo Corereg |]]],
1641 Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
1642 store_1, pf_su_8_32;
1643 Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
1644 PtrTo Corereg |]];
1645 Instruction_name ["vst1"]],
1646 Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
1647 store_1, [S64; U64];
1648 Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
1649 PtrTo Corereg |];
1650 Use_operands [| VecArray (4, Dreg);
1651 PtrTo Corereg |]]],
1652 Use_operands [| PtrTo Corereg; VecArray (4, Qreg) |], "vst4Q",
1653 store_1, pf_su_8_32;
1655 Vstx_lane 4,
1656 [Disassembles_as [Use_operands
1657 [| VecArray (4, Element_of_dreg);
1658 CstPtrTo Corereg |]]],
1659 Use_operands [| PtrTo Corereg; VecArray (4, Dreg); Immed |], "vst4_lane",
1660 store_3, P8 :: P16 :: F32 :: su_8_32;
1661 Vstx_lane 4,
1662 [Disassembles_as [Use_operands
1663 [| VecArray (4, Element_of_dreg);
1664 CstPtrTo Corereg |]]],
1665 Use_operands [| PtrTo Corereg; VecArray (4, Qreg); Immed |], "vst4Q_lane",
1666 store_3, [P16; F32; U16; U32; S16; S32];
1668 (* Logical operations. And. *)
1669 Vand, [], All (3, Dreg), "vand", notype_2, su_8_64;
1670 Vand, [], All (3, Qreg), "vandQ", notype_2, su_8_64;
1672 (* Or. *)
1673 Vorr, [], All (3, Dreg), "vorr", notype_2, su_8_64;
1674 Vorr, [], All (3, Qreg), "vorrQ", notype_2, su_8_64;
1676 (* Eor. *)
1677 Veor, [], All (3, Dreg), "veor", notype_2, su_8_64;
1678 Veor, [], All (3, Qreg), "veorQ", notype_2, su_8_64;
1680 (* Bic (And-not). *)
1681 Vbic, [], All (3, Dreg), "vbic", notype_2, su_8_64;
1682 Vbic, [], All (3, Qreg), "vbicQ", notype_2, su_8_64;
1684 (* Or-not. *)
1685 Vorn, [], All (3, Dreg), "vorn", notype_2, su_8_64;
1686 Vorn, [], All (3, Qreg), "vornQ", notype_2, su_8_64;
(* Reinterpretation casts.  For every pair of distinct element types in
   [elems], emit a D-register and a Q-register "vreinterpret" variant.
   Each fold builds its result from an empty base list; the inner fold
   collects all Cast (convto, convfrom) pairs for one destination type.
   NOTE(review): the scraped view of this file had dropped the short
   lines holding the fold base accumulators ([]) and the `in' closing
   the [types] binding; they are restored here to match upstream GCC. *)
let reinterp =
  let elems = P8 :: P16 :: F32 :: su_8_64 in
  List.fold_right
    (fun convto acc ->
      (* All source types this destination type can be reinterpreted
         from -- i.e. every element type except [convto] itself.  *)
      let types =
        List.fold_right
          (fun convfrom acc ->
            if convfrom <> convto then
              Cast (convto, convfrom) :: acc
            else
              acc)
          elems
          []
      in
      let dconv = Vreinterp, [No_op], Use_operands [| Dreg; Dreg |],
                  "vreinterpret", conv_1, types
      and qconv = Vreinterp, [No_op], Use_operands [| Qreg; Qreg |],
                  "vreinterpretQ", conv_1, types in
      dconv :: qconv :: acc)
    elems
    []
1710 (* Output routines. *)
(* Render an element type as the textual suffix used in intrinsic and
   builtin names, e.g. S8 -> "s8", F32 -> "f32", B32 -> "32".
   A conversion or cast yields both element names joined by an
   underscore.  Raises Failure on NoElts.  *)
let rec string_of_elt elt =
  match elt with
    Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "_" ^ string_of_elt b
  | S8 -> "s8" | S16 -> "s16" | S32 -> "s32" | S64 -> "s64"
  | U8 -> "u8" | U16 -> "u16" | U32 -> "u32" | U64 -> "u64"
  | I8 -> "i8" | I16 -> "i16" | I32 -> "i32" | I64 -> "i64"
  | P8 -> "p8" | P16 -> "p16" | F32 -> "f32"
  | B8 -> "8" | B16 -> "16" | B32 -> "32" | B64 -> "64"
  | NoElts -> failwith "No elts"
(* As string_of_elt, except that the two halves of a conversion or
   cast are separated by a dot instead of an underscore (used where
   the output wants e.g. "s32.f32").  *)
let string_of_elt_dots = function
    Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "." ^ string_of_elt b
  | elt -> string_of_elt elt
(* Render a vector type as its C type name.  Vector and scalar element
   types get the "_t" suffix applied by the top-level call (e.g.
   "int8x8_t"); immediates, void, builtin scalar stand-ins, pointers,
   const qualifiers and array aggregates are spelled out directly.
   NOTE(review): the scraped view of this file had dropped the short
   line holding the `in' between the inner definition and its use;
   it is restored here to match upstream GCC.  *)
let string_of_vectype vt =
  (* [affix] is applied to names that take the "_t"-style suffix; it is
     deliberately NOT applied inside pointer/const wrappers' output,
     and is reset for the base type of an array aggregate.  *)
  let rec name affix = function
    T_int8x8    -> affix "int8x8"
  | T_int8x16   -> affix "int8x16"
  | T_int16x4   -> affix "int16x4"
  | T_int16x8   -> affix "int16x8"
  | T_int32x2   -> affix "int32x2"
  | T_int32x4   -> affix "int32x4"
  | T_int64x1   -> affix "int64x1"
  | T_int64x2   -> affix "int64x2"
  | T_uint8x8   -> affix "uint8x8"
  | T_uint8x16  -> affix "uint8x16"
  | T_uint16x4  -> affix "uint16x4"
  | T_uint16x8  -> affix "uint16x8"
  | T_uint32x2  -> affix "uint32x2"
  | T_uint32x4  -> affix "uint32x4"
  | T_uint64x1  -> affix "uint64x1"
  | T_uint64x2  -> affix "uint64x2"
  | T_float32x2 -> affix "float32x2"
  | T_float32x4 -> affix "float32x4"
  | T_poly8x8   -> affix "poly8x8"
  | T_poly8x16  -> affix "poly8x16"
  | T_poly16x4  -> affix "poly16x4"
  | T_poly16x8  -> affix "poly16x8"
  | T_int8      -> affix "int8"
  | T_int16     -> affix "int16"
  | T_int32     -> affix "int32"
  | T_int64     -> affix "int64"
  | T_uint8     -> affix "uint8"
  | T_uint16    -> affix "uint16"
  | T_uint32    -> affix "uint32"
  | T_uint64    -> affix "uint64"
  | T_poly8     -> affix "poly8"
  | T_poly16    -> affix "poly16"
  | T_float32   -> affix "float32"
  | T_immediate _ -> "const int"
  | T_void      -> "void"
  | T_intQI     -> "__builtin_neon_qi"
  | T_intHI     -> "__builtin_neon_hi"
  | T_intSI     -> "__builtin_neon_si"
  | T_intDI     -> "__builtin_neon_di"
  | T_arrayof (num, base) ->
      (* Array aggregates are named e.g. "int8x8x2_t": the suffix is
         applied to the composed name, so the base is rendered with the
         identity affix first.  *)
      let basename = name (fun x -> x) base in
      affix (Printf.sprintf "%sx%d" basename num)
  | T_ptrto x ->
      let basename = name affix x in
      Printf.sprintf "%s *" basename
  | T_const x ->
      let basename = name affix x in
      Printf.sprintf "const %s" basename
  in
    name (fun x -> x ^ "_t") vt
(* LLVM LOCAL begin Print builtin type names that include the vector type. *)
(* Map a builtin argument-type tag to the name of the builtin C type used
   for it.  From the visible cases: "d" tags use 64-bit (D-register sized)
   vector modes (v8qi, v4hi, v2si, v1di, v2sf) and "q" tags use 128-bit
   (Q-register sized) ones (v16qi, v8hi, v4si, v2di, v4sf); the trailing
   digit in the printed name gives the number of vectors packed into the
   struct mode (TId=2, EId=3, OId=4; OIq=2, CIq=3, XIq=4).  *)
let string_of_inttype = function
    B_TId8mode -> "__builtin_neon_v8qi2"
  | B_TId16mode -> "__builtin_neon_v4hi2"
  | B_TId32mode -> "__builtin_neon_v2si2"
  | B_TId64mode -> "__builtin_neon_v1di2"
  | B_TIdSFmode -> "__builtin_neon_v2sf2"
  | B_EId8mode -> "__builtin_neon_v8qi3"
  | B_EId16mode -> "__builtin_neon_v4hi3"
  | B_EId32mode -> "__builtin_neon_v2si3"
  | B_EId64mode -> "__builtin_neon_v1di3"
  | B_EIdSFmode -> "__builtin_neon_v2sf3"
  | B_OId8mode -> "__builtin_neon_v8qi4"
  | B_OId16mode -> "__builtin_neon_v4hi4"
  | B_OId32mode -> "__builtin_neon_v2si4"
  | B_OId64mode -> "__builtin_neon_v1di4"
  | B_OIdSFmode -> "__builtin_neon_v2sf4"
  | B_OIq8mode -> "__builtin_neon_v16qi2"
  | B_OIq16mode -> "__builtin_neon_v8hi2"
  | B_OIq32mode -> "__builtin_neon_v4si2"
  | B_OIq64mode -> "__builtin_neon_v2di2"
  | B_OIqSFmode -> "__builtin_neon_v4sf2"
  | B_CIq8mode -> "__builtin_neon_v16qi3"
  | B_CIq16mode -> "__builtin_neon_v8hi3"
  | B_CIq32mode -> "__builtin_neon_v4si3"
  | B_CIq64mode -> "__builtin_neon_v2di3"
  | B_CIqSFmode -> "__builtin_neon_v4sf3"
  | B_XIq8mode -> "__builtin_neon_v16qi4"
  | B_XIq16mode -> "__builtin_neon_v8hi4"
  | B_XIq32mode -> "__builtin_neon_v4si4"
  | B_XIq64mode -> "__builtin_neon_v2di4"
  | B_XIqSFmode -> "__builtin_neon_v4sf4"
(* LLVM LOCAL end Print builtin type names that include the vector type. *)
(* Lower-case name of a machine mode, as used in builtin names.  *)
let string_of_mode mode =
  match mode with
  | V8QI -> "v8qi"
  | V4HI -> "v4hi"
  | V2SI -> "v2si"
  | V2SF -> "v2sf"
  | DI -> "di"
  | V16QI -> "v16qi"
  | V8HI -> "v8hi"
  | V4SI -> "v4si"
  | V4SF -> "v4sf"
  | V2DI -> "v2di"
  | QI -> "qi"
  | HI -> "hi"
  | SI -> "si"
  (* LLVM LOCAL *)
  | SF -> "sf"
  | V1DI -> "v1di"
(* Use uppercase chars for letters which form part of the intrinsic name, but
   should be omitted from the builtin name (the info is passed in an extra
   argument, instead).  The intrinsic name is the fully lower-cased form;
   builtin_name below instead deletes the uppercase characters.
   NOTE(review): String.lowercase is the legacy Latin-1-aware function,
   deprecated/removed in modern OCaml in favour of lowercase_ascii; kept
   as-is for compatibility with the OCaml this generator targets. *)
let intrinsic_name name = String.lowercase name
(* Allow the name of the builtin to be overridden by things (e.g. Flipped)
   found in the features list; the first Flipped/Builtin_name entry in the
   list wins.  The builtin name is then formed by deleting every character
   that is changed by lower-casing (i.e. the uppercase letters); digits,
   underscores and lowercase letters are kept.  *)
let builtin_name features name =
  let name =
    List.fold_right
      (fun el acc ->
        match el with
        | Flipped x | Builtin_name x -> x
        | _ -> acc)
      features name
  in
  (* True for any character unaffected by lower-casing.  *)
  let keep c =
    let s = String.make 1 c in
    (String.lowercase s) = s
  in
  let buf = Buffer.create (String.length name) in
  String.iter (fun c -> if keep c then Buffer.add_char buf c) name;
  Buffer.contents buf
(* Transform an arity into the list of C type names of its return type
   followed by its argument types.  *)
let strings_of_arity a =
  let vectypes =
    match a with
    | Arity0 vt -> [vt]
    | Arity1 (vt1, vt2) -> [vt1; vt2]
    | Arity2 (vt1, vt2, vt3) -> [vt1; vt2; vt3]
    | Arity3 (vt1, vt2, vt3, vt4) -> [vt1; vt2; vt3; vt4]
    | Arity4 (vt1, vt2, vt3, vt4, vt5) -> [vt1; vt2; vt3; vt4; vt5]
  in
  List.map string_of_vectype vectypes
(* Suffixes on the end of builtin names that are to be stripped in order
   to obtain the name used as an instruction.  They are only stripped if
   preceded immediately by an underscore; e.g. "vshl_n" yields the
   instruction name "vshl" (see get_insn_names).  *)
let suffixes_to_strip = [ "n"; "lane"; "dup" ]
(* Get the possible names of an instruction corresponding to a "name" from the
   ops table.  This is done by getting the equivalent builtin name and
   stripping any suffixes from the list at the top of this file, unless
   the features list presents with an Instruction_name entry, in which
   case that is used; or unless the features list presents with a Flipped
   entry, in which case that is used.  If both such entries are present,
   the first in the list will be chosen.
   NOTE(review): the source dump appears to have lost this definition's
   keyword-only lines ("end", "in", "try"); they are restored here.  *)
let get_insn_names features name =
  let names =
    try
      begin
        match List.find (fun feature -> match feature with
                                          Instruction_name _ -> true
                                        | Flipped _ -> true
                                        | _ -> false) features
        with
          Instruction_name names -> names
        | Flipped name -> [name]
        | _ -> assert false  (* List.find only returns the above two.  *)
      end
    with Not_found -> [builtin_name features name]
  in
  begin
    List.map (fun name' ->
      try
        (* Strip a known suffix only when it follows the last '_'.  *)
        let underscore = String.rindex name' '_' in
        let our_suffix = String.sub name' (underscore + 1)
                                   ((String.length name') - underscore - 1)
        in
        let rec strip remaining_suffixes =
          match remaining_suffixes with
            [] -> name'
          | s::ss when our_suffix = s -> String.sub name' 0 underscore
          | _::ss -> strip ss
        in
        strip suffixes_to_strip
      (* No underscore at all, or an empty tail: keep the name as-is.  *)
      with (Not_found | Invalid_argument _) -> name') names
  end
(* Apply [f] to each element of [elts] and append the results,
   comma-separated, to the string [acc].  *)
let commas f elts acc =
  acc ^ String.concat ", " (List.map f elts)
(* Given a list of features and the shape specified in the "ops" table, apply
   a function to each possible shape that the instruction may have.
   By default, this is the "shape" entry in "ops".  If the features list
   contains a Disassembles_as entry, the shapes contained in that entry are
   mapped to corresponding outputs and returned in a list.  If there is more
   than one Disassembles_as entry, only the first is used.
   NOTE(review): the source dump appears to have lost the "try" line of
   this definition; it is restored here.  *)
let analyze_all_shapes features shape f =
  try
    match List.find (fun feature ->
                       match feature with Disassembles_as _ -> true
                                        | _ -> false)
                    features with
      Disassembles_as shapes -> List.map f shapes
    | _ -> assert false  (* List.find only returns Disassembles_as.  *)
  with Not_found -> [f shape]