OPTIMIZATIONS

   1 #1
   2 (defun mysl (s)
   3     (declare (simple-string s))
   4     (declare (optimize (speed 3) (safety 0) (debug 0)))
   5     (let ((c 0))
   6       (declare (fixnum c))
   7       (dotimes (i (length s))
   8         (when (eql (aref s i) #\1)
   9           (incf c)))
  10       c))
  11
  12 * On X86 I is represented as a tagged integer.
  13
  14 * Unnecessary move:
  15   3: SLOT S!11[EDX] {SB-C::VECTOR-LENGTH 1 7} => t23[EAX]
  16   4: MOVE t23[EAX] => t24[EBX]
  17
  18 --------------------------------------------------------------------------------
  19 #2
  20 (defun quux (v)
  21   (declare (optimize (speed 3) (safety 0) (space 2) (debug 0)))
  22   (declare (type (simple-array double-float 1) v))
  23   (let ((s 0d0))
  24     (declare (type double-float s))
  25     (dotimes (i (length v))
  26       (setq s (+ s (aref v i))))
  27     s))
  28
  29 * Python does not combine + with AREF, so it generates an extra move and
  30   allocates a register.
  31
  32 * On X86 Python thinks that all FP registers are directly accessible
  33   and emits costly MOVE ... => FR1.
  34
  35 --------------------------------------------------------------------------------
  36 #3
  37 (defun bar (n)
  38   (declare (optimize (speed 3) (safety 0) (space 2))
  39            (type fixnum n))
  40   (let ((v (make-list n)))
  41     (setq v (make-array n))
  42     (length v)))
  43
  44 * IR1 does not optimize away (MAKE-LIST N).
  45 --------------------------------------------------------------------------------
  46 #4
  47 (defun bar (v1 v2)
  48   (declare (optimize (speed 3) (safety 0) (space 2))
  49            (type (simple-array base-char 1) v1 v2))
  50   (dotimes (i (length v1))
  51     (setf (aref v2 i) (aref v1 i))))
  52
  53 VOP DATA-VECTOR-SET/SIMPLE-STRING V2!14[EDI] t32[EAX] t30[S2]>t33[CL]
  54                                   => t34[S2]<t35[AL]
  55         MOV     #<TN t33[CL]>, #<TN t30[S2]>
  56         MOV     BYTE PTR [EDI+EAX+1], #<TN t33[CL]>
  57         MOV     #<TN t35[AL]>, #<TN t33[CL]>
  58         MOV     #<TN t34[S2]>, #<TN t35[AL]>
  59
  60 * The value of DATA-VECTOR-SET is not used, so there is no need for the
  61   last two moves.
  62
  63 * And why two moves?
  64 --------------------------------------------------------------------------------
  65 #8
  66 (defun foo (d)
  67   (declare (optimize (speed 3) (safety 0) (debug 0)))
  68   (declare (type (double-float 0d0 1d0) d))
  69   (loop for i fixnum from 1 to 5
  70         for x1 double-float = (sin d) ;;; !!!
  71         do (loop for j fixnum from 1 to 4
  72                  sum x1 double-float)))
  73
  74 Without the marked declaration Python will use boxed representation for X1.
  75
  76 This is equivalent to
  77
  78 (let ((x nil))
  79   (setq x 0d0)
  80   ;; use of X as DOUBLE-FLOAT
  81 )
  82
  83 The initial binding has no effect, and without it X is of type
  84 DOUBLE-FLOAT. Unfortunately, IR1 does not optimize away ineffectual
  85 SETs/bindings, and IR2 does not perform type inference.
  86 --------------------------------------------------------------------------------
  87 #9 "Multi-path constant folding"
  88 (defun foo (x)
  89   (if (= (cond ((irgh x) 0)
  90                ((buh x) 1)
  91                (t 2))
  92          0)
  93       :yes
  94       :no))
  95
  96 This code could be optimized to
  97
  98 (defun foo (x)
  99   (cond ((irgh x) :yes)
 100         ((buh x) :no)
 101         (t :no)))
 102 --------------------------------------------------------------------------------
 103 #11
 104 (inverted variant of #9)
 105
 106 (lambda (x)
 107   (let ((y (sap-alien x c-string)))
 108     (list (alien-sap y)
 109           (alien-sap y))))
 110
 111 It could be optimized to
 112
 113 (lambda (x) (list x x))
 114
 115 (if Y were used only once, the current compiler would optimize it)
 116 --------------------------------------------------------------------------------
 117 #12
 118 (typep (truly-the (simple-array * (*)) x) 'simple-vector)
 119
 120 tests lowtag.
 121 --------------------------------------------------------------------------------
 122 #13
 123 FAST-+/FIXNUM and similar should accept unboxed arguments in the
 124 interests of representation selection. Problem: inter-TN dependencies.
 125 --------------------------------------------------------------------------------
 126 #14
 127 The derived type of (/ (THE (DOUBLE-FLOAT (0D0)) X) (THE (DOUBLE-FLOAT
 128 1D0) Y)) is (DOUBLE-FLOAT 0.0d0). While it might be reasonable, it is
 129 better to derive (OR (MEMBER 0.0d0) (DOUBLE-FLOAT (0.0d0))).
 130 --------------------------------------------------------------------------------
 131 #15
 132 On the alpha, the system is reluctant to refer directly to a constant bignum,
 133 preferring to load a large constant through a slow sequence of instructions,
 134 then cons up a bignum for it:
 135
 136 (LAMBDA (A)
 137   (DECLARE (OPTIMIZE (SAFETY 1) (SPEED 3) (DEBUG 1))
 138            (TYPE (INTEGER -10000 10000) A)
 139            (IGNORABLE A))
 140   (CASE A
 141     ((89 125 16) (ASH A (MIN 18 -706)))
 142     (T (DPB -3 (BYTE 30 30) -1))))
 143 --------------------------------------------------------------------------------
 144 #16
 145 (do ((i 0 (1+ i)))
 146     ((= i (the (integer 0 100) n)))
 147   ...)
 148
 149 It is commonly expected for Python to derive (FIXNUMP I). (If ``='' is
 150 replaced with ``>='', Python will do so.)
 151 --------------------------------------------------------------------------------
 152 #17
 153 Type tests for (ARRAY BIT), (ARRAY T) and similar go through full
 154 %TYPEP, even though it is relatively simple to establish the arrayness
 155 of an object and also to obtain the element type of an array.  As of
 156 sbcl-0.8.12.30, this affects at least DUMP-OBJECT through
 157 COMPOUND-OBJECT-P, and (LABELS MAYBE-EMIT-MAKE-LOAD-FORMS GROVEL)
 158 through TYPEP UNBOXED-ARRAY, within the compiler itself.
 159 --------------------------------------------------------------------------------
 160 #18
 161 (lambda (x) (declare (null x)) (sxhash x)) goes through SYMBOL-HASH
 162 rather than either constant-folding or manipulating NIL-VALUE or
 163 NULL-TN directly.
 164 --------------------------------------------------------------------------------
 165 #19
 166   (let ((dx (if (foo)
 167                 (list x)
 168                 (list y z))))
 169     (declare (dynamic-extent dx))
 170     ...)
 171
 172 DX is not allocated on the stack.
 173 --------------------------------------------------------------------------------
 174 #20
 175 (defun-with-dx foo (x)
 176   (flet ((make (x)
 177            (let ((l (list nil nil)))
 178              (setf (first l) x)
 179              (setf (second l) (1- x))
 180              l)))
 181     (let ((l (make x)))
 182       (declare (dynamic-extent l))
 183       (mapc #'print l))))
 184
 185 Result of MAKE is not stack allocated, which means that
 186 stack-allocation of structures is impossible.
 187 --------------------------------------------------------------------------------
 188 #21
 189 (defun-with-dx foo ()
 190   (let ((dx (list (list 1 2) (list 3 4))))
 191     (declare (dynamic-extent dx))
 192     ...))
 193
 194 The outer list in DX is allocated on the stack, but the inner lists are not.
 195 --------------------------------------------------------------------------------
 196 #22
 197 IR2 does not perform unused code flushing.
 198 --------------------------------------------------------------------------------
 199 #23
 200 Python does not know that &REST lists are LISTs (and cannot derive it).
 201 --------------------------------------------------------------------------------
 202 #24
 203 a. Iterations on &REST lists, returning them as VALUES could be
 204    rewritten with &MORE vectors.
 205 b. Implement local unknown-values mv-call (useful for fast type checking).
 206 --------------------------------------------------------------------------------
 207 #26
 208 SBCL cannot derive upper bound for I and uses generic arithmetic here:
 209
 210 (defun foo (l)
 211   (declare (vector l))
 212   (dotimes (i (length l))
 213     (if (block nil
 214           (map-foo (lambda (x) (if x (return t)))
 215                    l))
 216         t
 217         nil)))
 218
 219 (So the constraint propagator or a possible future SSA-convertor
 220 should know the connection between an NLE and its CLEANUP.)
 221 --------------------------------------------------------------------------------
 222 #27
 223 Initialization of stack-allocated arrays is inefficient: we always
 224 fill the vector with zeroes, even when it is not needed (as for
 225 platforms with conservative GC or for arrays of unboxed objects) and
 226 is performed later explicitly.
 227
 228 (This is harder than it might look at first glance, as MAKE-ARRAY is smart
 229 enough to eliminate something like ':initial-element 0'.  Such an optimization
 230 is valid if the vector is being allocated in the heap, but not if it is being
 231 allocated on the stack.  You could remove this optimization, but that makes
 232 the heap-allocated case somewhat slower...)
 233 --------------------------------------------------------------------------------
 234 #28
 235 a. Accessing raw slots in structure instances is less efficient than
 236 it could be; if we placed raw slots before the header word, we would
 237 not need to do arithmetic at runtime to access them.  (But beware:
 238 this would complicate handling of the interior pointer).
 239
 240 b. (Also note that raw slots are currently disabled on HPPA)
 241 --------------------------------------------------------------------------------
 242 #29
 243 Python is overly zealous when converting high-level CL functions, such
 244 as MIN/MAX, LOGBITP, and LOGTEST, to low-level CL functions.  Reducing
 245 Python's aggressiveness would make it easier to effect changes such as
 246
 247 x86-64:
 248 * direct MIN/MAX on {SINGLE,DOUBLE}-FLOATs ({MIN,MAX}S{S,D})
 249
 250 x86-64:
 251 * direct LOGBITP on word-sized integers and fixnums (BT + JC)
 252
 253 x86{,-64}/PPC:
 254 * branch-free MIN/MAX on word-sized integers and fixnums (floats could
 255   be handled too, modulo safety considerations on the PPC)
 256
 257 x86-64:
 258 * efficient LOGTESTs on word-sized integers and fixnums (TEST)
 259
 260 etc., etc.
 261
 262 (The framework for this has been implemented as of 0.9.9.18; see the
 263 vm-support-routine COMBINATION-IMPLEMENTATION-STYLE and its use in
 264 src/compiler/ir1opt.lisp, IR1-OPTIMIZE-COMBINATION.  The above
 265 optimizations are left as an exercise for the reader.)
 266 --------------------------------------------------------------------------------
 267 #30
 268 (defun foo (x y)
 269   (< x y))
 270
 271 FOO's IR1 representation is roughly:
 272
 273 (defun foo (x y)
 274   (if (< x y)
 275       T
 276       NIL))
 277
 278 However, if a full call is generated for < (and similarly for other
 279 predicate functions), then the IF is unnecessary, since the return value
 280 of (< x y) is already T or NIL.
 281 --------------------------------------------------------------------------------
 282 #31
 283 The typecheck generated for a declaration like (integer 0 45) on x86 looks
 284 like:
 285
 286 ;      12B:       F6C203           TEST DL, 3
 287 ;      12E:       753B             JNE L1
 288 ;      130:       8BC2             MOV EAX, EDX
 289 ;      132:       83F800           CMP EAX, 0
 290 ;      135:       7C34             JL L1
 291 ;      137:       8BC2             MOV EAX, EDX
 292 ;      139:       3DB4000000       CMP EAX, 180
 293 ;      13E:       7F2B             JNLE L1
 294
 295 A better code sequence for this would be:
 296
 297   TEST DL, 3
 298   JNE L1
 299   MOV EAX, EDX
 300   CMP EAX, 180
 301   JA L1
 302
 303 Doing an unsigned comparison means that, similarly to %CHECK-BOUND, we can
 304 combine the <0 and >=bound tests.  This sort of test is generated often
 305 in SBCL and any array-based code that's serious about type-checking its
 306 indices.
 307 --------------------------------------------------------------------------------
 308 #32
 309 The code for a vector bounds check on x86 (similarly on x86-64) where
 310 the vector is in EDX and the index in EAX looks like:
 311
 312 ;       49: L0:   8B5AFD           MOV EBX, [EDX-3]
 313 ;       4C:       39C3             CMP EBX, EAX
 314 ;       4E:       7632             JBE L2
 315
 316 because %CHECK-BOUND is used for bounds-checking any array dimension.
 317 A more efficient specialization (%CHECK-BOUND/VECTOR) would produce:
 318
 319   CMP [EDX-3], EAX
 320   JBE L2
 321
 322 Which is slightly shorter and avoids using a register.
 323 --------------------------------------------------------------------------------
 324 #33
 325 Reports from the Java camp indicate that using an SSE2-based
 326 floating-point backend on x86 when possible is highly preferable to
 327 using the x86 FP stack.  It would be nice if SBCL included an SSE2-based
 328 floating point backend with a compile-time option to switch between the
 329 two.
 330 --------------------------------------------------------------------------------
 331 #34
 332 Compiling
 333
 334 (defun foo (x y)
 335   (declare (type (integer 0 45) x y))
 336   (+ x y))
 337
 338 results in the following error trapping code for type-checking the
 339 arguments:
 340
 341 ;      424: L0:   8B058CE31812     MOV EAX, [#x1218E38C]      ; '(MOD 46)
 342 ;      42A:       0F0B0A           BREAK 10                   ; error trap
 343 ;      42D:       05               BYTE #X05
 344 ;      42E:       1F               BYTE #X1F                  ; OBJECT-NOT-TYPE-ERROR
 345 ;      42F:       FECE01           BYTE #XFE, #XCE, #X01      ; EDI
 346 ;      432:       0E               BYTE #X0E                  ; EAX
 347 ;      433: L1:   8B0590E31812     MOV EAX, [#x1218E390]      ; '(MOD 46)
 348 ;      439:       0F0B0A           BREAK 10                   ; error trap
 349 ;      43C:       03               BYTE #X03
 350 ;      43D:       1F               BYTE #X1F                  ; OBJECT-NOT-TYPE-ERROR
 351 ;      43E:       8E               BYTE #X8E                  ; EDX
 352 ;      43F:       0E               BYTE #X0E                  ; EAX
 353
 354 Notice that '(MOD 46) has two entries in the constant vector.  Having
 355 one would be preferable.
 356 --------------------------------------------------------------------------------
 357 #35
 358 Compiling
 359
 360 (defun foo (a i)
 361   (declare (type simple-vector a))
 362   (aref a i))
 363
 364 results in the following x86 code:
 365
 366 ; 115886E9:       F7C703000000     TEST EDI, 3                ; no-arg-parsing entry point
 367 ;      6EF:       7510             JNE L0
 368 ;      6F1:       8BC7             MOV EAX, EDI
 369 ;      6F3:       83F800           CMP EAX, 0
 370 ;      6F6:       7C09             JL L0
 371 ;      6F8:       8BC7             MOV EAX, EDI
 372 ;      6FA:       3DF8FFFF7F       CMP EAX, 2147483640
 373 ;      6FF:       7E0F             JLE L1
 374 ;      701: L0:   8B057C865811     MOV EAX, [#x1158867C]      ; '(MOD
 375                                                               ;   536870911)
 376 ;      707:       0F0B0A           BREAK 10                   ; error trap
 377 ;      70A:       05               BYTE #X05
 378 ;      70B:       1F               BYTE #X1F                  ; OBJECT-NOT-TYPE-ERROR
 379 ;      70C:       FECE01           BYTE #XFE, #XCE, #X01      ; EDI
 380 ;      70F:       0E               BYTE #X0E                  ; EAX
 381 ;      710: L1:   8B42FD           MOV EAX, [EDX-3]
 382 ;      713:       8BCF             MOV ECX, EDI
 383 ;      715:       39C8             CMP EAX, ECX
 384 ;      717:       7620             JBE L2
 385 ;      719:       8B540A01         MOV EDX, [EDX+ECX+1]
 386
 387 ... plus the standard return sequence and some error blocks.  The
 388 `TEST EDI, 3' and associated comparisons are to ensure that `I' is a
 389 non-negative fixnum.  The associated comparisons are unnecessary, as the
 390 %CHECK-BOUND VOP only requires its tested index to be a fixnum and takes
 391 care of the negative fixnum case itself.
 392
 393 {HAIRY-,}DATA-VECTOR-REF are DEFKNOWN'd with EXPLICIT-CHECK, which would
 394 seem to take care of this, but EXPLICIT-CHECK only seems to be used when
 395 compiling calls to unknown functions or similar.  Furthermore,
 396 EXPLICIT-CHECK, as NJF understands it, doesn't have the right
 397 semantics--it suppresses all type checking of arguments, whereas what we
 398 really want is to ensure that the argument is a fixnum, but not check
 399 its positiveness.
 400 --------------------------------------------------------------------------------
 401 #36
 402
 403 In #35, the CMP EAX, $foo instructions are all preceded by a MOV.  They
 404 appear to be unnecessary, but are necessary because in IR2, EDI is a
 405 DESCRIPTOR-REG, whereas EAX is an ANY-REG--and the comparison VOPs only
 406 accept ANY-REGs.  Therefore, the MOVs are "necessary" to ensure that the
 407 comparison VOP receives a TN of the appropriate storage class.
 408
 409 Obviously, it would be better if a) we only performed one MOV prior to
 410 all three comparisons or b) eliminated the necessity of the MOV(s)
 411 altogether.  The former option is probably easier than the latter.
 412
 413 --------------------------------------------------------------------------------
 414 #37
 415
 416 Dynamic extent allocation doesn't currently work for one-element lists,
 417 since there's a source transform from (LIST X) to (CONS X NIL).
 418
 419 --------------------------------------------------------------------------------
 420 #38
 421
 422 (setf (subseq s1 start1 end1) (subseq s2 start2 end2))
 423
 424 could be transformed into
 425
 426 (let ((#:s2 s2)
 427       (#:start2 start2)
 428       (#:end2 end2))
 429  (replace s1 #:s2 :start1 start1 :end1 end1 :start2 #:start2 :end2 #:end2))
 430
 431 when the return value is unused, avoiding the need to cons up the new sequence.
 432
 433 --------------------------------------------------------------------------------
 434 #39
 435
 436 (let ((*foo* 42)) ...)
 437
 438 currently compiles to code that ensures the TLS index at runtime, which
 439 is both a decently large chunk of code and unnecessary, as we could ensure
 440 the TLS index at load-time instead.
 441