OPTIMIZATIONS

   1 #1
   2 (defun mysl (s)
   3     (declare (simple-string s))
   4     (declare (optimize (speed 3) (safety 0) (debug 0)))
   5     (let ((c 0))
   6       (declare (fixnum c))
   7       (dotimes (i (length s))
   8         (when (eql (aref s i) #\1)
   9           (incf c)))
  10       c))
  11
  12 * On X86 I is represented as a tagged integer.
  13
  14 * Unnecessary move:
  15   3: SLOT S!11[EDX] {SB-C::VECTOR-LENGTH 1 7} => t23[EAX]
  16   4: MOVE t23[EAX] => t24[EBX]
  17
  18 --------------------------------------------------------------------------------
  19 #2
  20 (defun quux (v)
  21   (declare (optimize (speed 3) (safety 0) (space 2) (debug 0)))
  22   (declare (type (simple-array double-float 1) v))
  23   (let ((s 0d0))
  24     (declare (type double-float s))
  25     (dotimes (i (length v))
  26       (setq s (+ s (aref v i))))
  27     s))
  28
  29 * Python does not combine + with AREF, so generates extra move and
  30   allocates a register.
  31
  32 * On X86 Python thinks that all FP registers are directly accessible
  33   and emits costy MOVE ... => FR1.
  34
  35 --------------------------------------------------------------------------------
  36 #3
  37 (defun bar (n)
  38   (declare (optimize (speed 3) (safety 0) (space 2))
  39            (type fixnum n))
  40   (let ((v (make-list n)))
  41     (setq v (make-array n))
  42     (length v)))
  43
  44 * IR1 does not optimize away (MAKE-LIST N).
  45 --------------------------------------------------------------------------------
  46 #4
  47 (defun bar (v1 v2)
  48   (declare (optimize (speed 3) (safety 0) (space 2))
  49            (type (simple-array base-char 1) v1 v2))
  50   (dotimes (i (length v1))
  51     (setf (aref v2 i) (aref v1 i))))
  52
  53 VOP DATA-VECTOR-SET/SIMPLE-STRING V2!14[EDI] t32[EAX] t30[S2]>t33[CL]
  54                                   => t34[S2]<t35[AL]
  55         MOV     #<TN t33[CL]>, #<TN t30[S2]>
  56         MOV     BYTE PTR [EDI+EAX+1], #<TN t33[CL]>
  57         MOV     #<TN t35[AL]>, #<TN t33[CL]>
  58         MOV     #<TN t34[S2]>, #<TN t35[AL]>
  59
  60 * The value of DATA-VECTOR-SET is not used, so there is no need in the
  61   last two moves.
  62
  63 * And why two moves?
  64 --------------------------------------------------------------------------------
  65 #8
  66 (defun foo (d)
  67   (declare (optimize (speed 3) (safety 0) (debug 0)))
  68   (declare (type (double-float 0d0 1d0) d))
  69   (loop for i fixnum from 1 to 5
  70         for x1 double-float = (sin d) ;;; !!!
  71         do (loop for j fixnum from 1 to 4
  72                  sum x1 double-float)))
  73
  74 Without the marked declaration Python will use boxed representation for X1.
  75
  76 This is equivalent to
  77
  78 (let ((x nil))
  79   (setq x 0d0)
  80   ;; use of X as DOUBLE-FLOAT
  81 )
  82
  83 The initial binding is effectless, and without it X is of type
  84 DOUBLE-FLOAT. Unhopefully, IR1 does not optimize away effectless
  85 SETs/bindings, and IR2 does not perform type inference.
  86 --------------------------------------------------------------------------------
  87 #9 "Multi-path constant folding"
  88 (defun foo (x)
  89   (if (= (cond ((irgh x) 0)
  90                ((buh x) 1)
  91                (t 2))
  92          0)
  93       :yes
  94       :no))
  95
  96 This code could be optimized to
  97
  98 (defun foo (x)
  99   (cond ((irgh x) :yes)
 100         ((buh x) :no)
 101         (t :no)))
 102 --------------------------------------------------------------------------------
 103 #11
 104 (inverted variant of #9)
 105
 106 (lambda (x)
 107   (let ((y (sap-alien x c-string)))
 108     (list (alien-sap y)
 109           (alien-sap y))))
 110
 111 It could be optimized to
 112
 113 (lambda (x) (list x x))
 114
 115 (if Y were used only once, the current compiler would optimize it)
 116 --------------------------------------------------------------------------------
 117 #12
 118 (typep (truly-the (simple-array * (*)) x) 'simple-vector)
 119
 120 tests lowtag.
 121 --------------------------------------------------------------------------------
 122 #13
 123 FAST-+/FIXNUM and similar should accept unboxed arguments in interests
 124 of representation selection. Problem: inter-TN dependencies.
 125 --------------------------------------------------------------------------------
 126 #14
 127 The derived type of (/ (THE (DOUBLE-FLOAT (0D0)) X) (THE (DOUBLE-FLOAT
 128 1D0) Y)) is (DOUBLE-FLOAT 0.0d0). While it might be reasonable, it is
 129 better to derive (OR (MEMBER 0.0d0) (DOUBLE-FLOAT (0.0d0))).
 130 --------------------------------------------------------------------------------
 131 #15
 132 On the alpha, the system is reluctant to refer directly to a constant bignum,
 133 preferring to load a large constant through a slow sequence of instructions,
 134 then cons up a bignum for it:
 135
 136 (LAMBDA (A)
 137   (DECLARE (OPTIMIZE (SAFETY 1) (SPEED 3) (DEBUG 1))
 138            (TYPE (INTEGER -10000 10000) A)
 139            (IGNORABLE A))
 140   (CASE A
 141     ((89 125 16) (ASH A (MIN 18 -706)))
 142     (T (DPB -3 (BYTE 30 30) -1))))
 143 --------------------------------------------------------------------------------
 144 #16
 145 (do ((i 0 (1+ i)))
 146     ((= i (the (integer 0 100) n)))
 147   ...)
 148
 149 It is commonly expected for Python to derive (FIXNUMP I). (If ``='' is
 150 replaced with ``>='', Python will do.)
 151 --------------------------------------------------------------------------------
 152 #17
 153 Type tests for (ARRAY BIT), (ARRAY T) and similar go through full
 154 %TYPEP, even though it is relatively simple to establish the arrayness
 155 of an object and also to obtain the element type of an array.  As of
 156 sbcl-0.8.12.30, this affects at least DUMP-OBJECT through
 157 COMPOUND-OBJECT-P, and (LABELS MAYBE-EMIT-MAKE-LOAD-FORMS GROVEL)
 158 through TYPEP UNBOXED-ARRAY, within the compiler itself.
 159 --------------------------------------------------------------------------------
 160 #22
 161 IR2 does not perform unused code flushing.
 162 --------------------------------------------------------------------------------
 163 #24
 164 a. Iterations on &REST lists could be rewritten with &MORE vectors.
 165 b. Implement local unknown-values mv-call (useful for fast type checking).
 166 --------------------------------------------------------------------------------
 167 #26
 168 SBCL cannot derive upper bound for I and uses generic arithmetic here:
 169
 170 (defun foo (l)
 171   (declare (vector l))
 172   (dotimes (i (length l))
 173     (if (block nil
 174           (map-foo (lambda (x) (if x (return t)))
 175                    l))
 176         t
 177         nil)))
 178
 179 (So the constraint propagator or a possible future SSA-convertor
 180 should know the connection between an NLE and its CLEANUP.)
 181 --------------------------------------------------------------------------------
 182 #27
 183 Initialization of stack-allocated arrays is inefficient: we always
 184 fill the vector with zeroes, even when it is not needed (as for
 185 platforms with conservative GC or for arrays of unboxed objectes) and
 186 is performed later explicitely.
 187
 188 (This is harder than it might look at first glance, as MAKE-ARRAY is smart
 189 enough to eliminate something like ':initial-element 0'.  Such an optimization
 190 is valid if the vector is being allocated in the heap, but not if it is being
 191 allocated on the stack.  You could remove this optimization, but that makes
 192 the heap-allocated case somewhat slower...)
 193
 194 To do this, extend ALLOCATE-VECTOR with ALLOW-JUNK argument, and when
 195 stack allocating don't zero if it is true -- and probably ALLOW-JUNK iff
 196 the vector is a specialized one (cannot have pointers.)
 197 --------------------------------------------------------------------------------
 198 #31
 199 The typecheck generated for a declaration like (integer 0 45) on x86 looks
 200 like:
 201
 202 ;      12B:       F6C203           TEST DL, 3
 203 ;      12E:       753B             JNE L1
 204 ;      130:       8BC2             MOV EAX, EDX
 205 ;      132:       83F800           CMP EAX, 0
 206 ;      135:       7C34             JL L1
 207 ;      137:       8BC2             MOV EAX, EDX
 208 ;      139:       3DB4000000       CMP EAX, 180
 209 ;      13E:       7F2B             JNLE L1
 210
 211 A better code sequence for this would be:
 212
 213   TEST DL, 3
 214   JNE L1
 215   MOV EAX, EDX
 216   CMP EAX, 180
 217   JBE L1
 218
 219 Doing an unsigned comparison means that, similarly to %CHECK-BOUND, we can
 220 combine the <0 and >=bound tests.  This sort of test is generated often
 221 in SBCL and any array-based code that's serious about type-checking its
 222 indices.
 223 --------------------------------------------------------------------------------
 224 #32
 225 The code for a vector bounds check on x86 (similarly on x86-64) where
 226 the vector is in EDX and the index in EAX looks like:
 227
 228 ;       49: L0:   8B5AFD           MOV EBX, [EDX-3]
 229 ;       4C:       39C3             CMP EBX, EAX
 230 ;       4E:       7632             JBE L2
 231
 232 because %CHECK-BOUND is used for bounds-checking any array dimension.
 233 A more efficient specialization (%CHECK-BOUND/VECTOR) would produce:
 234
 235   CMP [EDX-3], EAX
 236   JBE L2
 237
 238 Which is slightly shorter and avoids using a register.
 239 --------------------------------------------------------------------------------
 240 #33
 241 Reports from the Java camp indicate that using an SSE2-based
 242 floating-point backend on x86 when possible is highly preferable to
 243 using the x86 FP stack.  It would be nice if SBCL included an SSE2-based
 244 floating point backend with a compile-time option to switch between the
 245 two.
 246 --------------------------------------------------------------------------------
 247 #35
 248 Compiling
 249
 250 (defun foo (a i)
 251   (declare (type simple-vector a))
 252   (aref a i))
 253
 254 results in the following x86 code:
 255
 256 ; 115886E9:       F7C703000000     TEST EDI, 3                ; no-arg-parsing entry point
 257 ;      6EF:       7510             JNE L0
 258 ;      6F1:       8BC7             MOV EAX, EDI
 259 ;      6F3:       83F800           CMP EAX, 0
 260 ;      6F6:       7C09             JL L0
 261 ;      6F8:       8BC7             MOV EAX, EDI
 262 ;      6FA:       3DF8FFFF7F       CMP EAX, 2147483640
 263 ;      6FF:       7E0F             JLE L1
 264 ;      701: L0:   8B057C865811     MOV EAX, [#x1158867C]      ; '(MOD
 265                                                               ;   536870911)
 266 ;      707:       0F0B0A           BREAK 10                   ; error trap
 267 ;      70A:       05               BYTE #X05
 268 ;      70B:       1F               BYTE #X1F                  ; OBJECT-NOT-TYPE-ERROR
 269 ;      70C:       FECE01           BYTE #XFE, #XCE, #X01      ; EDI
 270 ;      70F:       0E               BYTE #X0E                  ; EAX
 271 ;      710: L1:   8B42FD           MOV EAX, [EDX-3]
 272 ;      713:       8BCF             MOV ECX, EDI
 273 ;      715:       39C8             CMP EAX, ECX
 274 ;      717:       7620             JBE L2
 275 ;      719:       8B540A01         MOV EDX, [EDX+ECX+1]
 276
 277 ... plus the standard return sequence and some error blocks.  The
 278 `TEST EDI, 3' and associated comparisons are to ensure that `I' is a
 279 positive fixnum.  The associated comparisons are unnecessary, as the
 280 %CHECK-BOUND VOP only requires its tested index to be a fixnum and takes
 281 care of the negative fixnum case itself.
 282
 283 {HAIRY-,}DATA-VECTOR-REF are DEFKNOWN'd with EXPLICIT-CHECK, which would
 284 seem to take care of this, but EXPLICIT-CHECK only seems to be used when
 285 compiling calls to unknown functions or similar.  Furthermore,
 286 EXPLICIT-CHECK, as NJF understands it, doesn't have the right
 287 semantics--it suppresses all type checking of arguments, whereas what we
 288 really want is to ensure that the argument is a fixnum, but not check
 289 its positiveness.
 290 --------------------------------------------------------------------------------
 291 #36
 292
 293 In #35, the CMP EAX, $foo instructions are all preceded by a MOV.  They
 294 appear to be unnecessary, but are necessary because in IR2, EDI is a
 295 DESCRIPTOR-REG, whereas EAX is an ANY-REG--and the comparison VOPs only
 296 accept ANY-REGs.  Therefore, the MOVs are "necessary" to ensure that the
 297 comparison VOP receives an TN of the appropriate storage class.
 298
 299 Obviously, it would be better if a) we only performed one MOV prior to
 300 all three comparisons or b) eliminated the necessity of the MOV(s)
 301 altogether.  The former option is probably easier than the latter.
 302
 303 --------------------------------------------------------------------------------
 304 #38
 305
 306 (setf (subseq s1 start1 end1) (subseq s2 start2 end1))
 307
 308 could be transformed into
 309
 310 (let ((#:s2 s2)
 311       (#:start2 start2)
 312       (#:end2 end2))
 313  (replace s1 #:s2 :start1 start1 :end1 end1 :start2 #:start2 :end2 #:end2))
 314
 315 when the return value is unused, avoiding the need to cons up the new sequence.
 316
 317 --------------------------------------------------------------------------------
 318 #39
 319
 320 (let ((*foo* 42)) ...)
 321
 322 currently compiles to code that ensures the TLS index at runtime, which
 323 is both a decently large chunk of code and unnecessary, as we could ensure
 324 the TLS index at load-time as well.
 325 [Note that x86-64 already does this.]
 326
 327 --------------------------------------------------------------------------------
 328 #40
 329
 330 When FTYPE is declared -- to say (function (t t t t t) t), and
 331 function has a compiler-macro,
 332
 333   (apply #'foo 'x1 x2 'x3 more)
 334
 335 can be transformed into
 336
 337   (apply (lambda (x2 x4 x5) (foo 'x1 x2 'x3 x4 x5)) x2 more)
 338
 339 which allows compiler-macro-expansion for FOO. (Only constant
 340 arguments can be moved inside the new lambda -- otherwise evaluation
 341 order is altered.)
 342
 343 --------------------------------------------------------------------------------
 344 #41
 345
 346 The unibyte external formats are written in a very generic way.  Three
 347 optimizations immediately applicable that could be automatically
 348 generated:
 349
 350 (a) if the external format merely permutes the first 256 characters, a
 351     constant-time lookup (rather than a binary search) could be
 352     performed on output.  This applies at least to EBCDIC, which
 353     currently has a hand-rolled mapper instead.
 354
 355 (b) if there are no undefined characters corresponding to the 256
 356     codes, then no error checking need be done on input.
 357
 358 (c) if there is a way to use particular bits of the exceptional
 359     characters, constant-time output (rather than binary search) can
 360     still be achieved as used to be done by the latin-9 external
 361     format before 1.0.31.