sys/dev/drm/i915/i915_gem_fence.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <drm/drmP.h>
  25 #include <drm/i915_drm.h>
  26 #include "i915_drv.h"
  27 #include "intel_drv.h"
  28
  29 /**
  30  * DOC: fence register handling
  31  *
  32  * Important to avoid confusions: "fences" in the i915 driver are not execution
  33  * fences used to track command completion but hardware detiler objects which
  34  * wrap a given range of the global GTT. Each platform has only a fairly limited
  35  * set of these objects.
  36  *
  37  * Fences are used to detile GTT memory mappings. They're also connected to the
  38  * hardware frontbuffer render tracking and hence interact with frontbuffer
  39  * compression. Furthermore on older platforms fences are required for tiled
  40  * objects used by the display engine. They can also be used by the render
  41  * engine - they're required for blitter commands and are optional for render
  42  * commands. But on gen4+ both display (with the exception of fbc) and rendering
  43  * have their own tiling state bits and don't need fences.
  44  *
  45  * Also note that fences only support X and Y tiling and hence can't be used for
  46  * the fancier new tiling formats like W, Ys and Yf.
  47  *
  48  * Finally note that because fences are such a restricted resource they're
  49  * dynamically associated with objects. Furthermore fence state is committed to
  50  * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must
  51  * explicitly call i915_gem_object_get_fence() to synchronize fencing status
  52  * for cpu access. Also note that some code wants an unfenced view, for those
  53  * cases the fence can be removed forcefully with i915_gem_object_put_fence().
  54  *
  55  * Internally these functions will synchronize with userspace access by removing
  56  * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
  57  */
  58
  59 static void i965_write_fence_reg(struct drm_device *dev, int reg,
  60                                  struct drm_i915_gem_object *obj)
  61 {
  62         struct drm_i915_private *dev_priv = dev->dev_private;
  63         int fence_reg;
  64         int fence_pitch_shift;
  65
  66         if (INTEL_INFO(dev)->gen >= 6) {
  67                 fence_reg = FENCE_REG_SANDYBRIDGE_0;
  68                 fence_pitch_shift = SANDYBRIDGE_FENCE_PITCH_SHIFT;
  69         } else {
  70                 fence_reg = FENCE_REG_965_0;
  71                 fence_pitch_shift = I965_FENCE_PITCH_SHIFT;
  72         }
  73
  74         fence_reg += reg * 8;
  75
  76         /* To w/a incoherency with non-atomic 64-bit register updates,
  77          * we split the 64-bit update into two 32-bit writes. In order
  78          * for a partial fence not to be evaluated between writes, we
  79          * precede the update with write to turn off the fence register,
  80          * and only enable the fence as the last step.
  81          *
  82          * For extra levels of paranoia, we make sure each step lands
  83          * before applying the next step.
  84          */
  85         I915_WRITE(fence_reg, 0);
  86         POSTING_READ(fence_reg);
  87
  88         if (obj) {
  89                 u32 size = i915_gem_obj_ggtt_size(obj);
  90                 uint64_t val;
  91
  92                 /* Adjust fence size to match tiled area */
  93                 if (obj->tiling_mode != I915_TILING_NONE) {
  94                         uint32_t row_size = obj->stride *
  95                                 (obj->tiling_mode == I915_TILING_Y ? 32 : 8);
  96                         size = (size / row_size) * row_size;
  97                 }
  98
  99                 val = (uint64_t)((i915_gem_obj_ggtt_offset(obj) + size - 4096) &
 100                                  0xfffff000) << 32;
 101                 val |= i915_gem_obj_ggtt_offset(obj) & 0xfffff000;
 102                 val |= (uint64_t)((obj->stride / 128) - 1) << fence_pitch_shift;
 103                 if (obj->tiling_mode == I915_TILING_Y)
 104                         val |= 1 << I965_FENCE_TILING_Y_SHIFT;
 105                 val |= I965_FENCE_REG_VALID;
 106
 107                 I915_WRITE(fence_reg + 4, val >> 32);
 108                 POSTING_READ(fence_reg + 4);
 109
 110                 I915_WRITE(fence_reg + 0, val);
 111                 POSTING_READ(fence_reg);
 112         } else {
 113                 I915_WRITE(fence_reg + 4, 0);
 114                 POSTING_READ(fence_reg + 4);
 115         }
 116 }
 117
 118 static void i915_write_fence_reg(struct drm_device *dev, int reg,
 119                                  struct drm_i915_gem_object *obj)
 120 {
 121         struct drm_i915_private *dev_priv = dev->dev_private;
 122         u32 val;
 123
 124         if (obj) {
 125                 u32 size = i915_gem_obj_ggtt_size(obj);
 126                 int pitch_val;
 127                 int tile_width;
 128
 129                 WARN((i915_gem_obj_ggtt_offset(obj) & ~I915_FENCE_START_MASK) ||
 130                      (size & -size) != size ||
 131                      (i915_gem_obj_ggtt_offset(obj) & (size - 1)),
 132                      "object 0x%08lx [fenceable? %d] not 1M or pot-size (0x%08x) aligned\n",
 133                      i915_gem_obj_ggtt_offset(obj), obj->map_and_fenceable, size);
 134
 135                 if (obj->tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev))
 136                         tile_width = 128;
 137                 else
 138                         tile_width = 512;
 139
 140                 /* Note: pitch better be a power of two tile widths */
 141                 pitch_val = obj->stride / tile_width;
 142                 pitch_val = ffs(pitch_val) - 1;
 143
 144                 val = i915_gem_obj_ggtt_offset(obj);
 145                 if (obj->tiling_mode == I915_TILING_Y)
 146                         val |= 1 << I830_FENCE_TILING_Y_SHIFT;
 147                 val |= I915_FENCE_SIZE_BITS(size);
 148                 val |= pitch_val << I830_FENCE_PITCH_SHIFT;
 149                 val |= I830_FENCE_REG_VALID;
 150         } else
 151                 val = 0;
 152
 153         if (reg < 8)
 154                 reg = FENCE_REG_830_0 + reg * 4;
 155         else
 156                 reg = FENCE_REG_945_8 + (reg - 8) * 4;
 157
 158         I915_WRITE(reg, val);
 159         POSTING_READ(reg);
 160 }
 161
 162 static void i830_write_fence_reg(struct drm_device *dev, int reg,
 163                                 struct drm_i915_gem_object *obj)
 164 {
 165         struct drm_i915_private *dev_priv = dev->dev_private;
 166         uint32_t val;
 167
 168         if (obj) {
 169                 u32 size = i915_gem_obj_ggtt_size(obj);
 170                 uint32_t pitch_val;
 171
 172                 WARN((i915_gem_obj_ggtt_offset(obj) & ~I830_FENCE_START_MASK) ||
 173                      (size & -size) != size ||
 174                      (i915_gem_obj_ggtt_offset(obj) & (size - 1)),
 175                      "object 0x%08lx not 512K or pot-size 0x%08x aligned\n",
 176                      i915_gem_obj_ggtt_offset(obj), size);
 177
 178                 pitch_val = obj->stride / 128;
 179                 pitch_val = ffs(pitch_val) - 1;
 180
 181                 val = i915_gem_obj_ggtt_offset(obj);
 182                 if (obj->tiling_mode == I915_TILING_Y)
 183                         val |= 1 << I830_FENCE_TILING_Y_SHIFT;
 184                 val |= I830_FENCE_SIZE_BITS(size);
 185                 val |= pitch_val << I830_FENCE_PITCH_SHIFT;
 186                 val |= I830_FENCE_REG_VALID;
 187         } else
 188                 val = 0;
 189
 190         I915_WRITE(FENCE_REG_830_0 + reg * 4, val);
 191         POSTING_READ(FENCE_REG_830_0 + reg * 4);
 192 }
 193
 194 inline static bool i915_gem_object_needs_mb(struct drm_i915_gem_object *obj)
 195 {
 196         return obj && obj->base.read_domains & I915_GEM_DOMAIN_GTT;
 197 }
 198
 199 static void i915_gem_write_fence(struct drm_device *dev, int reg,
 200                                  struct drm_i915_gem_object *obj)
 201 {
 202         struct drm_i915_private *dev_priv = dev->dev_private;
 203
 204         /* Ensure that all CPU reads are completed before installing a fence
 205          * and all writes before removing the fence.
 206          */
 207         if (i915_gem_object_needs_mb(dev_priv->fence_regs[reg].obj))
 208                 mb();
 209
 210         WARN(obj && (!obj->stride || !obj->tiling_mode),
 211              "bogus fence setup with stride: 0x%x, tiling mode: %i\n",
 212              obj->stride, obj->tiling_mode);
 213
 214         if (IS_GEN2(dev))
 215                 i830_write_fence_reg(dev, reg, obj);
 216         else if (IS_GEN3(dev))
 217                 i915_write_fence_reg(dev, reg, obj);
 218         else if (INTEL_INFO(dev)->gen >= 4)
 219                 i965_write_fence_reg(dev, reg, obj);
 220
 221         /* And similarly be paranoid that no direct access to this region
 222          * is reordered to before the fence is installed.
 223          */
 224         if (i915_gem_object_needs_mb(obj))
 225                 mb();
 226 }
 227
 228 static inline int fence_number(struct drm_i915_private *dev_priv,
 229                                struct drm_i915_fence_reg *fence)
 230 {
 231         return fence - dev_priv->fence_regs;
 232 }
 233
 234 static void i915_gem_object_update_fence(struct drm_i915_gem_object *obj,
 235                                          struct drm_i915_fence_reg *fence,
 236                                          bool enable)
 237 {
 238         struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 239         int reg = fence_number(dev_priv, fence);
 240
 241         i915_gem_write_fence(obj->base.dev, reg, enable ? obj : NULL);
 242
 243         if (enable) {
 244                 obj->fence_reg = reg;
 245                 fence->obj = obj;
 246                 list_move_tail(&fence->lru_list, &dev_priv->mm.fence_list);
 247         } else {
 248                 obj->fence_reg = I915_FENCE_REG_NONE;
 249                 fence->obj = NULL;
 250                 list_del_init(&fence->lru_list);
 251         }
 252         obj->fence_dirty = false;
 253 }
 254
 255 static inline void i915_gem_object_fence_lost(struct drm_i915_gem_object *obj)
 256 {
 257         if (obj->tiling_mode)
 258                 i915_gem_release_mmap(obj);
 259
 260         /* As we do not have an associated fence register, we will force
 261          * a tiling change if we ever need to acquire one.
 262          */
 263         obj->fence_dirty = false;
 264         obj->fence_reg = I915_FENCE_REG_NONE;
 265 }
 266
 267 static int
 268 i915_gem_object_wait_fence(struct drm_i915_gem_object *obj)
 269 {
 270         if (obj->last_fenced_req) {
 271                 int ret = i915_wait_request(obj->last_fenced_req);
 272                 if (ret)
 273                         return ret;
 274
 275                 i915_gem_request_assign(&obj->last_fenced_req, NULL);
 276         }
 277
 278         return 0;
 279 }
 280
 281 /**
 282  * i915_gem_object_put_fence - force-remove fence for an object
 283  * @obj: object to map through a fence reg
 284  *
 285  * This function force-removes any fence from the given object, which is useful
 286  * if the kernel wants to do untiled GTT access.
 287  *
 288  * Returns:
 289  *
 290  * 0 on success, negative error code on failure.
 291  */
 292 int
 293 i915_gem_object_put_fence(struct drm_i915_gem_object *obj)
 294 {
 295         struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 296         struct drm_i915_fence_reg *fence;
 297         int ret;
 298
 299         ret = i915_gem_object_wait_fence(obj);
 300         if (ret)
 301                 return ret;
 302
 303         if (obj->fence_reg == I915_FENCE_REG_NONE)
 304                 return 0;
 305
 306         fence = &dev_priv->fence_regs[obj->fence_reg];
 307
 308         if (WARN_ON(fence->pin_count))
 309                 return -EBUSY;
 310
 311         i915_gem_object_fence_lost(obj);
 312         i915_gem_object_update_fence(obj, fence, false);
 313
 314         return 0;
 315 }
 316
 317 static struct drm_i915_fence_reg *
 318 i915_find_fence_reg(struct drm_device *dev)
 319 {
 320         struct drm_i915_private *dev_priv = dev->dev_private;
 321         struct drm_i915_fence_reg *reg, *avail;
 322         int i;
 323
 324         /* First try to find a free reg */
 325         avail = NULL;
 326         for (i = dev_priv->fence_reg_start; i < dev_priv->num_fence_regs; i++) {
 327                 reg = &dev_priv->fence_regs[i];
 328                 if (!reg->obj)
 329                         return reg;
 330
 331                 if (!reg->pin_count)
 332                         avail = reg;
 333         }
 334
 335         if (avail == NULL)
 336                 goto deadlock;
 337
 338         /* None available, try to steal one or wait for a user to finish */
 339         list_for_each_entry(reg, &dev_priv->mm.fence_list, lru_list) {
 340                 if (reg->pin_count)
 341                         continue;
 342
 343                 return reg;
 344         }
 345
 346 deadlock:
 347         /* Wait for completion of pending flips which consume fences */
 348         if (intel_has_pending_fb_unpin(dev))
 349                 return ERR_PTR(-EAGAIN);
 350
 351         return ERR_PTR(-EDEADLK);
 352 }
 353
 354 /**
 355  * i915_gem_object_get_fence - set up fencing for an object
 356  * @obj: object to map through a fence reg
 357  *
 358  * When mapping objects through the GTT, userspace wants to be able to write
 359  * to them without having to worry about swizzling if the object is tiled.
 360  * This function walks the fence regs looking for a free one for @obj,
 361  * stealing one if it can't find any.
 362  *
 363  * It then sets up the reg based on the object's properties: address, pitch
 364  * and tiling format.
 365  *
 366  * For an untiled surface, this removes any existing fence.
 367  *
 368  * Returns:
 369  *
 370  * 0 on success, negative error code on failure.
 371  */
 372 int
 373 i915_gem_object_get_fence(struct drm_i915_gem_object *obj)
 374 {
 375         struct drm_device *dev = obj->base.dev;
 376         struct drm_i915_private *dev_priv = dev->dev_private;
 377         bool enable = obj->tiling_mode != I915_TILING_NONE;
 378         struct drm_i915_fence_reg *reg;
 379         int ret;
 380
 381         /* Have we updated the tiling parameters upon the object and so
 382          * will need to serialise the write to the associated fence register?
 383          */
 384         if (obj->fence_dirty) {
 385                 ret = i915_gem_object_wait_fence(obj);
 386                 if (ret)
 387                         return ret;
 388         }
 389
 390         /* Just update our place in the LRU if our fence is getting reused. */
 391         if (obj->fence_reg != I915_FENCE_REG_NONE) {
 392                 reg = &dev_priv->fence_regs[obj->fence_reg];
 393                 if (!obj->fence_dirty) {
 394                         list_move_tail(&reg->lru_list,
 395                                        &dev_priv->mm.fence_list);
 396                         return 0;
 397                 }
 398         } else if (enable) {
 399                 if (WARN_ON(!obj->map_and_fenceable))
 400                         return -EINVAL;
 401
 402                 reg = i915_find_fence_reg(dev);
 403                 if (IS_ERR(reg))
 404                         return PTR_ERR(reg);
 405
 406                 if (reg->obj) {
 407                         struct drm_i915_gem_object *old = reg->obj;
 408
 409                         ret = i915_gem_object_wait_fence(old);
 410                         if (ret)
 411                                 return ret;
 412
 413                         i915_gem_object_fence_lost(old);
 414                 }
 415         } else
 416                 return 0;
 417
 418         i915_gem_object_update_fence(obj, reg, enable);
 419
 420         return 0;
 421 }
 422
 423 /**
 424  * i915_gem_object_pin_fence - pin fencing state
 425  * @obj: object to pin fencing for
 426  *
 427  * This pins the fencing state (whether tiled or untiled) to make sure the
 428  * object is ready to be used as a scanout target. Fencing status must be
 429  * synchronize first by calling i915_gem_object_get_fence():
 430  *
 431  * The resulting fence pin reference must be released again with
 432  * i915_gem_object_unpin_fence().
 433  *
 434  * Returns:
 435  *
 436  * True if the object has a fence, false otherwise.
 437  */
 438 bool
 439 i915_gem_object_pin_fence(struct drm_i915_gem_object *obj)
 440 {
 441         if (obj->fence_reg != I915_FENCE_REG_NONE) {
 442                 struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 443                 struct i915_vma *ggtt_vma = i915_gem_obj_to_ggtt(obj);
 444
 445                 WARN_ON(!ggtt_vma ||
 446                         dev_priv->fence_regs[obj->fence_reg].pin_count >
 447                         ggtt_vma->pin_count);
 448                 dev_priv->fence_regs[obj->fence_reg].pin_count++;
 449                 return true;
 450         } else
 451                 return false;
 452 }
 453
 454 /**
 455  * i915_gem_object_unpin_fence - unpin fencing state
 456  * @obj: object to unpin fencing for
 457  *
 458  * This releases the fence pin reference acquired through
 459  * i915_gem_object_pin_fence. It will handle both objects with and without an
 460  * attached fence correctly, callers do not need to distinguish this.
 461  */
 462 void
 463 i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
 464 {
 465         if (obj->fence_reg != I915_FENCE_REG_NONE) {
 466                 struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 467                 WARN_ON(dev_priv->fence_regs[obj->fence_reg].pin_count <= 0);
 468                 dev_priv->fence_regs[obj->fence_reg].pin_count--;
 469         }
 470 }
 471
 472 /**
 473  * i915_gem_restore_fences - restore fence state
 474  * @dev: DRM device
 475  *
 476  * Restore the hw fence state to match the software tracking again, to be called
 477  * after a gpu reset and on resume.
 478  */
 479 void i915_gem_restore_fences(struct drm_device *dev)
 480 {
 481         struct drm_i915_private *dev_priv = dev->dev_private;
 482         int i;
 483
 484         for (i = 0; i < dev_priv->num_fence_regs; i++) {
 485                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
 486
 487                 /*
 488                  * Commit delayed tiling changes if we have an object still
 489                  * attached to the fence, otherwise just clear the fence.
 490                  */
 491                 if (reg->obj) {
 492                         i915_gem_object_update_fence(reg->obj, reg,
 493                                                      reg->obj->tiling_mode);
 494                 } else {
 495                         i915_gem_write_fence(dev, i, NULL);
 496                 }
 497         }
 498 }
 499
 500 /**
 501  * DOC: tiling swizzling details
 502  *
 503  * The idea behind tiling is to increase cache hit rates by rearranging
 504  * pixel data so that a group of pixel accesses are in the same cacheline.
 505  * Performance improvement from doing this on the back/depth buffer are on
 506  * the order of 30%.
 507  *
 508  * Intel architectures make this somewhat more complicated, though, by
 509  * adjustments made to addressing of data when the memory is in interleaved
 510  * mode (matched pairs of DIMMS) to improve memory bandwidth.
 511  * For interleaved memory, the CPU sends every sequential 64 bytes
 512  * to an alternate memory channel so it can get the bandwidth from both.
 513  *
 514  * The GPU also rearranges its accesses for increased bandwidth to interleaved
 515  * memory, and it matches what the CPU does for non-tiled.  However, when tiled
 516  * it does it a little differently, since one walks addresses not just in the
 517  * X direction but also Y.  So, along with alternating channels when bit
 518  * 6 of the address flips, it also alternates when other bits flip --  Bits 9
 519  * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines)
 520  * are common to both the 915 and 965-class hardware.
 521  *
 522  * The CPU also sometimes XORs in higher bits as well, to improve
 523  * bandwidth doing strided access like we do so frequently in graphics.  This
 524  * is called "Channel XOR Randomization" in the MCH documentation.  The result
 525  * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address
 526  * decode.
 527  *
 528  * All of this bit 6 XORing has an effect on our memory management,
 529  * as we need to make sure that the 3d driver can correctly address object
 530  * contents.
 531  *
 532  * If we don't have interleaved memory, all tiling is safe and no swizzling is
 533  * required.
 534  *
 535  * When bit 17 is XORed in, we simply refuse to tile at all.  Bit
 536  * 17 is not just a page offset, so as we page an object out and back in,
 537  * individual pages in it will have different bit 17 addresses, resulting in
 538  * each 64 bytes being swapped with its neighbor!
 539  *
 540  * Otherwise, if interleaved, we have to tell the 3d driver what the address
 541  * swizzling it needs to do is, since it's writing with the CPU to the pages
 542  * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the
 543  * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling
 544  * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order
 545  * to match what the GPU expects.
 546  */
 547
 548 /**
 549  * i915_gem_detect_bit_6_swizzle - detect bit 6 swizzling pattern
 550  * @dev: DRM device
 551  *
 552  * Detects bit 6 swizzling of address lookup between IGD access and CPU
 553  * access through main memory.
 554  */
 555 void
 556 i915_gem_detect_bit_6_swizzle(struct drm_device *dev)
 557 {
 558         struct drm_i915_private *dev_priv = dev->dev_private;
 559         uint32_t swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
 560         uint32_t swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
 561
 562         if (INTEL_INFO(dev)->gen >= 8 || IS_VALLEYVIEW(dev)) {
 563                 /*
 564                  * On BDW+, swizzling is not used. We leave the CPU memory
 565                  * controller in charge of optimizing memory accesses without
 566                  * the extra address manipulation GPU side.
 567                  *
 568                  * VLV and CHV don't have GPU swizzling.
 569                  */
 570                 swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 571                 swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 572         } else if (INTEL_INFO(dev)->gen >= 6) {
 573                 if (dev_priv->preserve_bios_swizzle) {
 574                         if (I915_READ(DISP_ARB_CTL) &
 575                             DISP_TILE_SURFACE_SWIZZLING) {
 576                                 swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 577                                 swizzle_y = I915_BIT_6_SWIZZLE_9;
 578                         } else {
 579                                 swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 580                                 swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 581                         }
 582                 } else {
 583                         uint32_t dimm_c0, dimm_c1;
 584                         dimm_c0 = I915_READ(MAD_DIMM_C0);
 585                         dimm_c1 = I915_READ(MAD_DIMM_C1);
 586                         dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
 587                         dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
 588                         /* Enable swizzling when the channels are populated
 589                          * with identically sized dimms. We don't need to check
 590                          * the 3rd channel because no cpu with gpu attached
 591                          * ships in that configuration. Also, swizzling only
 592                          * makes sense for 2 channels anyway. */
 593                         if (dimm_c0 == dimm_c1) {
 594                                 swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 595                                 swizzle_y = I915_BIT_6_SWIZZLE_9;
 596                         } else {
 597                                 swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 598                                 swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 599                         }
 600                 }
 601         } else if (IS_GEN5(dev)) {
 602                 /* On Ironlake whatever DRAM config, GPU always do
 603                  * same swizzling setup.
 604                  */
 605                 swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 606                 swizzle_y = I915_BIT_6_SWIZZLE_9;
 607         } else if (IS_GEN2(dev)) {
 608                 /* As far as we know, the 865 doesn't have these bit 6
 609                  * swizzling issues.
 610                  */
 611                 swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 612                 swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 613         } else if (IS_MOBILE(dev) || (IS_GEN3(dev) && !IS_G33(dev))) {
 614                 uint32_t dcc;
 615
 616                 /* On 9xx chipsets, channel interleave by the CPU is
 617                  * determined by DCC.  For single-channel, neither the CPU
 618                  * nor the GPU do swizzling.  For dual channel interleaved,
 619                  * the GPU's interleave is bit 9 and 10 for X tiled, and bit
 620                  * 9 for Y tiled.  The CPU's interleave is independent, and
 621                  * can be based on either bit 11 (haven't seen this yet) or
 622                  * bit 17 (common).
 623                  */
 624                 dcc = I915_READ(DCC);
 625                 switch (dcc & DCC_ADDRESSING_MODE_MASK) {
 626                 case DCC_ADDRESSING_MODE_SINGLE_CHANNEL:
 627                 case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC:
 628                         swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 629                         swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 630                         break;
 631                 case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED:
 632                         if (dcc & DCC_CHANNEL_XOR_DISABLE) {
 633                                 /* This is the base swizzling by the GPU for
 634                                  * tiled buffers.
 635                                  */
 636                                 swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 637                                 swizzle_y = I915_BIT_6_SWIZZLE_9;
 638                         } else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) {
 639                                 /* Bit 11 swizzling by the CPU in addition. */
 640                                 swizzle_x = I915_BIT_6_SWIZZLE_9_10_11;
 641                                 swizzle_y = I915_BIT_6_SWIZZLE_9_11;
 642                         } else {
 643                                 /* Bit 17 swizzling by the CPU in addition. */
 644                                 swizzle_x = I915_BIT_6_SWIZZLE_9_10_17;
 645                                 swizzle_y = I915_BIT_6_SWIZZLE_9_17;
 646                         }
 647                         break;
 648                 }
 649
 650                 /* check for L-shaped memory aka modified enhanced addressing */
 651                 if (IS_GEN4(dev)) {
 652                         uint32_t ddc2 = I915_READ(DCC2);
 653
 654                         if (!(ddc2 & DCC2_MODIFIED_ENHANCED_DISABLE))
 655                                 dev_priv->quirks |= QUIRK_PIN_SWIZZLED_PAGES;
 656                 }
 657
 658                 if (dcc == 0xffffffff) {
 659                         DRM_ERROR("Couldn't read from MCHBAR.  "
 660                                   "Disabling tiling.\n");
 661                         swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
 662                         swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
 663                 }
 664         } else {
 665                 /* The 965, G33, and newer, have a very flexible memory
 666                  * configuration.  It will enable dual-channel mode
 667                  * (interleaving) on as much memory as it can, and the GPU
 668                  * will additionally sometimes enable different bit 6
 669                  * swizzling for tiled objects from the CPU.
 670                  *
 671                  * Here's what I found on the G965:
 672                  *    slot fill         memory size  swizzling
 673                  * 0A   0B   1A   1B    1-ch   2-ch
 674                  * 512  0    0    0     512    0     O
 675                  * 512  0    512  0     16     1008  X
 676                  * 512  0    0    512   16     1008  X
 677                  * 0    512  0    512   16     1008  X
 678                  * 1024 1024 1024 0     2048   1024  O
 679                  *
 680                  * We could probably detect this based on either the DRB
 681                  * matching, which was the case for the swizzling required in
 682                  * the table above, or from the 1-ch value being less than
 683                  * the minimum size of a rank.
 684                  */
 685                 if (I915_READ16(C0DRB3) != I915_READ16(C1DRB3)) {
 686                         swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 687                         swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 688                 } else {
 689                         swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 690                         swizzle_y = I915_BIT_6_SWIZZLE_9;
 691                 }
 692         }
 693
 694         dev_priv->mm.bit_6_swizzle_x = swizzle_x;
 695         dev_priv->mm.bit_6_swizzle_y = swizzle_y;
 696 }
 697
 698 /*
 699  * Swap every 64 bytes of this page around, to account for it having a new
 700  * bit 17 of its physical address and therefore being interpreted differently
 701  * by the GPU.
 702  */
 703 static void
 704 i915_gem_swizzle_page(struct vm_page *page)
 705 {
 706         char temp[64];
 707         char *vaddr;
 708         int i;
 709
 710         vaddr = kmap(page);
 711
 712         for (i = 0; i < PAGE_SIZE; i += 128) {
 713                 memcpy(temp, &vaddr[i], 64);
 714                 memcpy(&vaddr[i], &vaddr[i + 64], 64);
 715                 memcpy(&vaddr[i + 64], temp, 64);
 716         }
 717
 718         kunmap(page);
 719 }
 720
 721 /**
 722  * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling
 723  * @obj: i915 GEM buffer object
 724  *
 725  * This function fixes up the swizzling in case any page frame number for this
 726  * object has changed in bit 17 since that state has been saved with
 727  * i915_gem_object_save_bit_17_swizzle().
 728  *
 729  * This is called when pinning backing storage again, since the kernel is free
 730  * to move unpinned backing storage around (either by directly moving pages or
 731  * by swapping them out and back in again).
 732  */
 733 void
 734 i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj)
 735 {
 736         struct sg_page_iter sg_iter;
 737         int i;
 738
 739         if (obj->bit_17 == NULL)
 740                 return;
 741
 742         i = 0;
 743         for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) {
 744                 struct vm_page *page = sg_page_iter_page(&sg_iter);
 745                 char new_bit_17 = page_to_phys(page) >> 17;
 746                 if ((new_bit_17 & 0x1) !=
 747                     (test_bit(i, obj->bit_17) != 0)) {
 748                         i915_gem_swizzle_page(page);
 749                         set_page_dirty(page);
 750                 }
 751                 i++;
 752         }
 753 }
 754
 755 /**
 756  * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling
 757  * @obj: i915 GEM buffer object
 758  *
 759  * This function saves the bit 17 of each page frame number so that swizzling
 760  * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must
 761  * be called before the backing storage can be unpinned.
 762  */
 763 void
 764 i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj)
 765 {
 766         struct sg_page_iter sg_iter;
 767         int page_count = obj->base.size >> PAGE_SHIFT;
 768         int i;
 769
 770         if (obj->bit_17 == NULL) {
 771                 obj->bit_17 = kcalloc(BITS_TO_LONGS(page_count),
 772                                       sizeof(long), GFP_KERNEL);
 773                 if (obj->bit_17 == NULL) {
 774                         DRM_ERROR("Failed to allocate memory for bit 17 "
 775                                   "record\n");
 776                         return;
 777                 }
 778         }
 779
 780         i = 0;
 781         for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) {
 782                 if (page_to_phys(sg_page_iter_page(&sg_iter)) & (1 << 17))
 783                         __set_bit(i, obj->bit_17);
 784                 else
 785                         __clear_bit(i, obj->bit_17);
 786                 i++;
 787         }
 788 }