fs/ocfs2/move_extents.c

   1 /* -*- mode: c; c-basic-offset: 8; -*-
   2  * vim: noexpandtab sw=8 ts=8 sts=0:
   3  *
   4  * move_extents.c
   5  *
   6  * Copyright (C) 2011 Oracle.  All rights reserved.
   7  *
   8  * This program is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public
  10  * License version 2 as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * General Public License for more details.
  16  */
  17 #include <linux/fs.h>
  18 #include <linux/types.h>
  19 #include <linux/mount.h>
  20 #include <linux/swap.h>
  21
  22 #include <cluster/masklog.h>
  23
  24 #include "ocfs2.h"
  25 #include "ocfs2_ioctl.h"
  26
  27 #include "alloc.h"
  28 #include "aops.h"
  29 #include "dlmglue.h"
  30 #include "extent_map.h"
  31 #include "inode.h"
  32 #include "journal.h"
  33 #include "suballoc.h"
  34 #include "uptodate.h"
  35 #include "super.h"
  36 #include "dir.h"
  37 #include "buffer_head_io.h"
  38 #include "sysfile.h"
  39 #include "suballoc.h"
  40 #include "refcounttree.h"
  41 #include "move_extents.h"
  42
  43 struct ocfs2_move_extents_context {
  44         struct inode *inode;
  45         struct file *file;
  46         int auto_defrag;
  47         int partial;
  48         int credits;
  49         u32 new_phys_cpos;
  50         u32 clusters_moved;
  51         u64 refcount_loc;
  52         struct ocfs2_move_extents *range;
  53         struct ocfs2_extent_tree et;
  54         struct ocfs2_alloc_context *meta_ac;
  55         struct ocfs2_alloc_context *data_ac;
  56         struct ocfs2_cached_dealloc_ctxt dealloc;
  57 };
  58
  59 static int __ocfs2_move_extent(handle_t *handle,
  60                                struct ocfs2_move_extents_context *context,
  61                                u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
  62                                int ext_flags)
  63 {
  64         int ret = 0, index;
  65         struct inode *inode = context->inode;
  66         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  67         struct ocfs2_extent_rec *rec, replace_rec;
  68         struct ocfs2_path *path = NULL;
  69         struct ocfs2_extent_list *el;
  70         u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
  71         u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
  72
  73         ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
  74                                                p_cpos, new_p_cpos, len);
  75         if (ret) {
  76                 mlog_errno(ret);
  77                 goto out;
  78         }
  79
  80         memset(&replace_rec, 0, sizeof(replace_rec));
  81         replace_rec.e_cpos = cpu_to_le32(cpos);
  82         replace_rec.e_leaf_clusters = cpu_to_le16(len);
  83         replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
  84                                                                    new_p_cpos));
  85
  86         path = ocfs2_new_path_from_et(&context->et);
  87         if (!path) {
  88                 ret = -ENOMEM;
  89                 mlog_errno(ret);
  90                 goto out;
  91         }
  92
  93         ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
  94         if (ret) {
  95                 mlog_errno(ret);
  96                 goto out;
  97         }
  98
  99         el = path_leaf_el(path);
 100
 101         index = ocfs2_search_extent_list(el, cpos);
 102         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
 103                 ocfs2_error(inode->i_sb,
 104                             "Inode %llu has an extent at cpos %u which can no "
 105                             "longer be found.\n",
 106                             (unsigned long long)ino, cpos);
 107                 ret = -EROFS;
 108                 goto out;
 109         }
 110
 111         rec = &el->l_recs[index];
 112
 113         BUG_ON(ext_flags != rec->e_flags);
 114         /*
 115          * after moving/defraging to new location, the extent is not going
 116          * to be refcounted anymore.
 117          */
 118         replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
 119
 120         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
 121                                       context->et.et_root_bh,
 122                                       OCFS2_JOURNAL_ACCESS_WRITE);
 123         if (ret) {
 124                 mlog_errno(ret);
 125                 goto out;
 126         }
 127
 128         ret = ocfs2_split_extent(handle, &context->et, path, index,
 129                                  &replace_rec, context->meta_ac,
 130                                  &context->dealloc);
 131         if (ret) {
 132                 mlog_errno(ret);
 133                 goto out;
 134         }
 135
 136         ocfs2_journal_dirty(handle, context->et.et_root_bh);
 137
 138         context->new_phys_cpos = new_p_cpos;
 139
 140         /*
 141          * need I to append truncate log for old clusters?
 142          */
 143         if (old_blkno) {
 144                 if (ext_flags & OCFS2_EXT_REFCOUNTED)
 145                         ret = ocfs2_decrease_refcount(inode, handle,
 146                                         ocfs2_blocks_to_clusters(osb->sb,
 147                                                                  old_blkno),
 148                                         len, context->meta_ac,
 149                                         &context->dealloc, 1);
 150                 else
 151                         ret = ocfs2_truncate_log_append(osb, handle,
 152                                                         old_blkno, len);
 153         }
 154
 155 out:
 156         return ret;
 157 }
 158
 159 /*
 160  * lock allocators, and reserving appropriate number of bits for
 161  * meta blocks and data clusters.
 162  *
 163  * in some cases, we don't need to reserve clusters, just let data_ac
 164  * be NULL.
 165  */
 166 static int ocfs2_lock_allocators_move_extents(struct inode *inode,
 167                                         struct ocfs2_extent_tree *et,
 168                                         u32 clusters_to_move,
 169                                         u32 extents_to_split,
 170                                         struct ocfs2_alloc_context **meta_ac,
 171                                         struct ocfs2_alloc_context **data_ac,
 172                                         int extra_blocks,
 173                                         int *credits)
 174 {
 175         int ret, num_free_extents;
 176         unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
 177         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 178
 179         num_free_extents = ocfs2_num_free_extents(osb, et);
 180         if (num_free_extents < 0) {
 181                 ret = num_free_extents;
 182                 mlog_errno(ret);
 183                 goto out;
 184         }
 185
 186         if (!num_free_extents ||
 187             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
 188                 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
 189
 190         ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
 191         if (ret) {
 192                 mlog_errno(ret);
 193                 goto out;
 194         }
 195
 196         if (data_ac) {
 197                 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
 198                 if (ret) {
 199                         mlog_errno(ret);
 200                         goto out;
 201                 }
 202         }
 203
 204         *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
 205                                               clusters_to_move + 2);
 206
 207         mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
 208              extra_blocks, clusters_to_move, *credits);
 209 out:
 210         if (ret) {
 211                 if (*meta_ac) {
 212                         ocfs2_free_alloc_context(*meta_ac);
 213                         *meta_ac = NULL;
 214                 }
 215         }
 216
 217         return ret;
 218 }
 219
 220 /*
 221  * Using one journal handle to guarantee the data consistency in case
 222  * crash happens anywhere.
 223  *
 224  *  XXX: defrag can end up with finishing partial extent as requested,
 225  * due to not enough contiguous clusters can be found in allocator.
 226  */
 227 static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
 228                                u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
 229 {
 230         int ret, credits = 0, extra_blocks = 0, partial = context->partial;
 231         handle_t *handle;
 232         struct inode *inode = context->inode;
 233         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 234         struct inode *tl_inode = osb->osb_tl_inode;
 235         struct ocfs2_refcount_tree *ref_tree = NULL;
 236         u32 new_phys_cpos, new_len;
 237         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 238
 239         if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
 240
 241                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
 242                          OCFS2_HAS_REFCOUNT_FL));
 243
 244                 BUG_ON(!context->refcount_loc);
 245
 246                 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
 247                                                &ref_tree, NULL);
 248                 if (ret) {
 249                         mlog_errno(ret);
 250                         return ret;
 251                 }
 252
 253                 ret = ocfs2_prepare_refcount_change_for_del(inode,
 254                                                         context->refcount_loc,
 255                                                         phys_blkno,
 256                                                         *len,
 257                                                         &credits,
 258                                                         &extra_blocks);
 259                 if (ret) {
 260                         mlog_errno(ret);
 261                         goto out;
 262                 }
 263         }
 264
 265         ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
 266                                                  &context->meta_ac,
 267                                                  &context->data_ac,
 268                                                  extra_blocks, &credits);
 269         if (ret) {
 270                 mlog_errno(ret);
 271                 goto out;
 272         }
 273
 274         /*
 275          * should be using allocation reservation strategy there?
 276          *
 277          * if (context->data_ac)
 278          *      context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
 279          */
 280
 281         mutex_lock(&tl_inode->i_mutex);
 282
 283         if (ocfs2_truncate_log_needs_flush(osb)) {
 284                 ret = __ocfs2_flush_truncate_log(osb);
 285                 if (ret < 0) {
 286                         mlog_errno(ret);
 287                         goto out_unlock_mutex;
 288                 }
 289         }
 290
 291         handle = ocfs2_start_trans(osb, credits);
 292         if (IS_ERR(handle)) {
 293                 ret = PTR_ERR(handle);
 294                 mlog_errno(ret);
 295                 goto out_unlock_mutex;
 296         }
 297
 298         ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
 299                                      &new_phys_cpos, &new_len);
 300         if (ret) {
 301                 mlog_errno(ret);
 302                 goto out_commit;
 303         }
 304
 305         /*
 306          * allowing partial extent moving is kind of 'pros and cons', it makes
 307          * whole defragmentation less likely to fail, on the contrary, the bad
 308          * thing is it may make the fs even more fragmented after moving, let
 309          * userspace make a good decision here.
 310          */
 311         if (new_len != *len) {
 312                 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
 313                 if (!partial) {
 314                         context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
 315                         ret = -ENOSPC;
 316                         goto out_commit;
 317                 }
 318         }
 319
 320         mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
 321              phys_cpos, new_phys_cpos);
 322
 323         ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
 324                                   new_phys_cpos, ext_flags);
 325         if (ret)
 326                 mlog_errno(ret);
 327
 328         if (partial && (new_len != *len))
 329                 *len = new_len;
 330
 331         /*
 332          * Here we should write the new page out first if we are
 333          * in write-back mode.
 334          */
 335         ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
 336         if (ret)
 337                 mlog_errno(ret);
 338
 339 out_commit:
 340         ocfs2_commit_trans(osb, handle);
 341
 342 out_unlock_mutex:
 343         mutex_unlock(&tl_inode->i_mutex);
 344
 345         if (context->data_ac) {
 346                 ocfs2_free_alloc_context(context->data_ac);
 347                 context->data_ac = NULL;
 348         }
 349
 350         if (context->meta_ac) {
 351                 ocfs2_free_alloc_context(context->meta_ac);
 352                 context->meta_ac = NULL;
 353         }
 354
 355 out:
 356         if (ref_tree)
 357                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 358
 359         return ret;
 360 }
 361
 362 /*
 363  * find the victim alloc group, where #blkno fits.
 364  */
 365 static int ocfs2_find_victim_alloc_group(struct inode *inode,
 366                                          u64 vict_blkno,
 367                                          int type, int slot,
 368                                          int *vict_bit,
 369                                          struct buffer_head **ret_bh)
 370 {
 371         int ret, i, bits_per_unit = 0;
 372         u64 blkno;
 373         char namebuf[40];
 374
 375         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 376         struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
 377         struct ocfs2_chain_list *cl;
 378         struct ocfs2_chain_rec *rec;
 379         struct ocfs2_dinode *ac_dinode;
 380         struct ocfs2_group_desc *bg;
 381
 382         ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
 383         ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
 384                                          strlen(namebuf), &blkno);
 385         if (ret) {
 386                 ret = -ENOENT;
 387                 goto out;
 388         }
 389
 390         ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
 391         if (ret) {
 392                 mlog_errno(ret);
 393                 goto out;
 394         }
 395
 396         ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
 397         cl = &(ac_dinode->id2.i_chain);
 398         rec = &(cl->cl_recs[0]);
 399
 400         if (type == GLOBAL_BITMAP_SYSTEM_INODE)
 401                 bits_per_unit = osb->s_clustersize_bits -
 402                                         inode->i_sb->s_blocksize_bits;
 403         /*
 404          * 'vict_blkno' was out of the valid range.
 405          */
 406         if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
 407             (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
 408                                 bits_per_unit))) {
 409                 ret = -EINVAL;
 410                 goto out;
 411         }
 412
 413         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
 414
 415                 rec = &(cl->cl_recs[i]);
 416                 if (!rec)
 417                         continue;
 418
 419                 bg = NULL;
 420
 421                 do {
 422                         if (!bg)
 423                                 blkno = le64_to_cpu(rec->c_blkno);
 424                         else
 425                                 blkno = le64_to_cpu(bg->bg_next_group);
 426
 427                         if (gd_bh) {
 428                                 brelse(gd_bh);
 429                                 gd_bh = NULL;
 430                         }
 431
 432                         ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
 433                         if (ret) {
 434                                 mlog_errno(ret);
 435                                 goto out;
 436                         }
 437
 438                         bg = (struct ocfs2_group_desc *)gd_bh->b_data;
 439
 440                         if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
 441                                                 le16_to_cpu(bg->bg_bits))) {
 442
 443                                 *ret_bh = gd_bh;
 444                                 *vict_bit = (vict_blkno - blkno) >>
 445                                                         bits_per_unit;
 446                                 mlog(0, "find the victim group: #%llu, "
 447                                      "total_bits: %u, vict_bit: %u\n",
 448                                      blkno, le16_to_cpu(bg->bg_bits),
 449                                      *vict_bit);
 450                                 goto out;
 451                         }
 452
 453                 } while (le64_to_cpu(bg->bg_next_group));
 454         }
 455
 456         ret = -EINVAL;
 457 out:
 458         brelse(ac_bh);
 459
 460         /*
 461          * caller has to release the gd_bh properly.
 462          */
 463         return ret;
 464 }
 465
 466 /*
 467  * XXX: helper to validate and adjust moving goal.
 468  */
 469 static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
 470                                                struct ocfs2_move_extents *range)
 471 {
 472         int ret, goal_bit = 0;
 473
 474         struct buffer_head *gd_bh = NULL;
 475         struct ocfs2_group_desc *bg = NULL;
 476         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 477         int c_to_b = 1 << (osb->s_clustersize_bits -
 478                                         inode->i_sb->s_blocksize_bits);
 479
 480         /*
 481          * make goal become cluster aligned.
 482          */
 483         range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
 484                                                       range->me_goal);
 485         /*
 486          * moving goal is not allowd to start with a group desc blok(#0 blk)
 487          * let's compromise to the latter cluster.
 488          */
 489         if (range->me_goal == le64_to_cpu(bg->bg_blkno))
 490                 range->me_goal += c_to_b;
 491
 492         /*
 493          * validate goal sits within global_bitmap, and return the victim
 494          * group desc
 495          */
 496         ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
 497                                             GLOBAL_BITMAP_SYSTEM_INODE,
 498                                             OCFS2_INVALID_SLOT,
 499                                             &goal_bit, &gd_bh);
 500         if (ret)
 501                 goto out;
 502
 503         bg = (struct ocfs2_group_desc *)gd_bh->b_data;
 504
 505         /*
 506          * movement is not gonna cross two groups.
 507          */
 508         if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
 509                                                                 range->me_len) {
 510                 ret = -EINVAL;
 511                 goto out;
 512         }
 513         /*
 514          * more exact validations/adjustments will be performed later during
 515          * moving operation for each extent range.
 516          */
 517         mlog(0, "extents get ready to be moved to #%llu block\n",
 518              range->me_goal);
 519
 520 out:
 521         brelse(gd_bh);
 522
 523         return ret;
 524 }
 525
 526 static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
 527                                     int *goal_bit, u32 move_len, u32 max_hop,
 528                                     u32 *phys_cpos)
 529 {
 530         int i, used, last_free_bits = 0, base_bit = *goal_bit;
 531         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 532         u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
 533                                                  le64_to_cpu(gd->bg_blkno));
 534
 535         for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
 536
 537                 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
 538                 if (used) {
 539                         /*
 540                          * we even tried searching the free chunk by jumping
 541                          * a 'max_hop' distance, but still failed.
 542                          */
 543                         if ((i - base_bit) > max_hop) {
 544                                 *phys_cpos = 0;
 545                                 break;
 546                         }
 547
 548                         if (last_free_bits)
 549                                 last_free_bits = 0;
 550
 551                         continue;
 552                 } else
 553                         last_free_bits++;
 554
 555                 if (last_free_bits == move_len) {
 556                         *goal_bit = i;
 557                         *phys_cpos = base_cpos + i;
 558                         break;
 559                 }
 560         }
 561
 562         mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
 563 }
 564
 565 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 566                                        handle_t *handle,
 567                                        struct buffer_head *di_bh,
 568                                        u32 num_bits,
 569                                        u16 chain)
 570 {
 571         int ret;
 572         u32 tmp_used;
 573         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 574         struct ocfs2_chain_list *cl =
 575                                 (struct ocfs2_chain_list *) &di->id2.i_chain;
 576
 577         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 578                                       OCFS2_JOURNAL_ACCESS_WRITE);
 579         if (ret < 0) {
 580                 mlog_errno(ret);
 581                 goto out;
 582         }
 583
 584         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
 585         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
 586         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
 587         ocfs2_journal_dirty(handle, di_bh);
 588
 589 out:
 590         return ret;
 591 }
 592
 593 static inline int ocfs2_block_group_set_bits(handle_t *handle,
 594                                              struct inode *alloc_inode,
 595                                              struct ocfs2_group_desc *bg,
 596                                              struct buffer_head *group_bh,
 597                                              unsigned int bit_off,
 598                                              unsigned int num_bits)
 599 {
 600         int status;
 601         void *bitmap = bg->bg_bitmap;
 602         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
 603
 604         /* All callers get the descriptor via
 605          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
 606         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 607         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 608
 609         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
 610              num_bits);
 611
 612         if (ocfs2_is_cluster_bitmap(alloc_inode))
 613                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 614
 615         status = ocfs2_journal_access_gd(handle,
 616                                          INODE_CACHE(alloc_inode),
 617                                          group_bh,
 618                                          journal_type);
 619         if (status < 0) {
 620                 mlog_errno(status);
 621                 goto bail;
 622         }
 623
 624         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
 625         if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
 626                 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
 627                             " count %u but claims %u are freed. num_bits %d",
 628                             (unsigned long long)le64_to_cpu(bg->bg_blkno),
 629                             le16_to_cpu(bg->bg_bits),
 630                             le16_to_cpu(bg->bg_free_bits_count), num_bits);
 631                 return -EROFS;
 632         }
 633         while (num_bits--)
 634                 ocfs2_set_bit(bit_off++, bitmap);
 635
 636         ocfs2_journal_dirty(handle, group_bh);
 637
 638 bail:
 639         return status;
 640 }
 641
 642 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 643                              u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
 644                              u32 len, int ext_flags)
 645 {
 646         int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
 647         handle_t *handle;
 648         struct inode *inode = context->inode;
 649         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 650         struct inode *tl_inode = osb->osb_tl_inode;
 651         struct inode *gb_inode = NULL;
 652         struct buffer_head *gb_bh = NULL;
 653         struct buffer_head *gd_bh = NULL;
 654         struct ocfs2_group_desc *gd;
 655         struct ocfs2_refcount_tree *ref_tree = NULL;
 656         u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
 657                                                     context->range->me_threshold);
 658         u64 phys_blkno, new_phys_blkno;
 659
 660         phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 661
 662         if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
 663
 664                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
 665                          OCFS2_HAS_REFCOUNT_FL));
 666
 667                 BUG_ON(!context->refcount_loc);
 668
 669                 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
 670                                                &ref_tree, NULL);
 671                 if (ret) {
 672                         mlog_errno(ret);
 673                         return ret;
 674                 }
 675
 676                 ret = ocfs2_prepare_refcount_change_for_del(inode,
 677                                                         context->refcount_loc,
 678                                                         phys_blkno,
 679                                                         len,
 680                                                         &credits,
 681                                                         &extra_blocks);
 682                 if (ret) {
 683                         mlog_errno(ret);
 684                         goto out;
 685                 }
 686         }
 687
 688         ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
 689                                                  &context->meta_ac,
 690                                                  NULL, extra_blocks, &credits);
 691         if (ret) {
 692                 mlog_errno(ret);
 693                 goto out;
 694         }
 695
 696         /*
 697          * need to count 2 extra credits for global_bitmap inode and
 698          * group descriptor.
 699          */
 700         credits += OCFS2_INODE_UPDATE_CREDITS + 1;
 701
 702         /*
 703          * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
 704          * logic, while we still need to lock the global_bitmap.
 705          */
 706         gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
 707                                                OCFS2_INVALID_SLOT);
 708         if (!gb_inode) {
 709                 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
 710                 ret = -EIO;
 711                 goto out;
 712         }
 713
 714         mutex_lock(&gb_inode->i_mutex);
 715
 716         ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
 717         if (ret) {
 718                 mlog_errno(ret);
 719                 goto out_unlock_gb_mutex;
 720         }
 721
 722         mutex_lock(&tl_inode->i_mutex);
 723
 724         handle = ocfs2_start_trans(osb, credits);
 725         if (IS_ERR(handle)) {
 726                 ret = PTR_ERR(handle);
 727                 mlog_errno(ret);
 728                 goto out_unlock_tl_inode;
 729         }
 730
 731         new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
 732         ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
 733                                             GLOBAL_BITMAP_SYSTEM_INODE,
 734                                             OCFS2_INVALID_SLOT,
 735                                             &goal_bit, &gd_bh);
 736         if (ret) {
 737                 mlog_errno(ret);
 738                 goto out_commit;
 739         }
 740
 741         /*
 742          * probe the victim cluster group to find a proper
 743          * region to fit wanted movement, it even will perfrom
 744          * a best-effort attempt by compromising to a threshold
 745          * around the goal.
 746          */
 747         ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
 748                                 new_phys_cpos);
 749         if (!new_phys_cpos) {
 750                 ret = -ENOSPC;
 751                 goto out_commit;
 752         }
 753
 754         ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
 755                                   *new_phys_cpos, ext_flags);
 756         if (ret) {
 757                 mlog_errno(ret);
 758                 goto out_commit;
 759         }
 760
 761         gd = (struct ocfs2_group_desc *)gd_bh->b_data;
 762         ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
 763                                                le16_to_cpu(gd->bg_chain));
 764         if (ret) {
 765                 mlog_errno(ret);
 766                 goto out_commit;
 767         }
 768
 769         ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
 770                                          goal_bit, len);
 771         if (ret)
 772                 mlog_errno(ret);
 773
 774         /*
 775          * Here we should write the new page out first if we are
 776          * in write-back mode.
 777          */
 778         ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
 779         if (ret)
 780                 mlog_errno(ret);
 781
 782 out_commit:
 783         ocfs2_commit_trans(osb, handle);
 784         brelse(gd_bh);
 785
 786 out_unlock_tl_inode:
 787         mutex_unlock(&tl_inode->i_mutex);
 788
 789         ocfs2_inode_unlock(gb_inode, 1);
 790 out_unlock_gb_mutex:
 791         mutex_unlock(&gb_inode->i_mutex);
 792         brelse(gb_bh);
 793         iput(gb_inode);
 794
 795 out:
 796         if (context->meta_ac) {
 797                 ocfs2_free_alloc_context(context->meta_ac);
 798                 context->meta_ac = NULL;
 799         }
 800
 801         if (ref_tree)
 802                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 803
 804         return ret;
 805 }
 806
 807 /*
 808  * Helper to calculate the defraging length in one run according to threshold.
 809  */
 810 static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
 811                                          u32 threshold, int *skip)
 812 {
 813         if ((*alloc_size + *len_defraged) < threshold) {
 814                 /*
 815                  * proceed defragmentation until we meet the thresh
 816                  */
 817                 *len_defraged += *alloc_size;
 818         } else if (*len_defraged == 0) {
 819                 /*
 820                  * XXX: skip a large extent.
 821                  */
 822                 *skip = 1;
 823         } else {
 824                 /*
 825                  * split this extent to coalesce with former pieces as
 826                  * to reach the threshold.
 827                  *
 828                  * we're done here with one cycle of defragmentation
 829                  * in a size of 'thresh', resetting 'len_defraged'
 830                  * forces a new defragmentation.
 831                  */
 832                 *alloc_size = threshold - *len_defraged;
 833                 *len_defraged = 0;
 834         }
 835 }
 836
 837 static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
 838                                 struct ocfs2_move_extents_context *context)
 839 {
 840         int ret = 0, flags, do_defrag, skip = 0;
 841         u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
 842         u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
 843
 844         struct inode *inode = context->inode;
 845         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 846         struct ocfs2_move_extents *range = context->range;
 847         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 848
 849         if ((inode->i_size == 0) || (range->me_len == 0))
 850                 return 0;
 851
 852         if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 853                 return 0;
 854
 855         context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
 856
 857         ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
 858         ocfs2_init_dealloc_ctxt(&context->dealloc);
 859
 860         /*
 861          * TO-DO XXX:
 862          *
 863          * - xattr extents.
 864          */
 865
 866         do_defrag = context->auto_defrag;
 867
 868         /*
 869          * extents moving happens in unit of clusters, for the sake
 870          * of simplicity, we may ignore two clusters where 'byte_start'
 871          * and 'byte_start + len' were within.
 872          */
 873         move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
 874         len_to_move = (range->me_start + range->me_len) >>
 875                                                 osb->s_clustersize_bits;
 876         if (len_to_move >= move_start)
 877                 len_to_move -= move_start;
 878         else
 879                 len_to_move = 0;
 880
 881         if (do_defrag) {
 882                 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
 883                 if (defrag_thresh <= 1)
 884                         goto done;
 885         } else
 886                 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
 887                                                          range->me_goal);
 888
 889         mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
 890              "thresh: %u\n",
 891              (unsigned long long)OCFS2_I(inode)->ip_blkno,
 892              (unsigned long long)range->me_start,
 893              (unsigned long long)range->me_len,
 894              move_start, len_to_move, defrag_thresh);
 895
 896         cpos = move_start;
 897         while (len_to_move) {
 898                 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
 899                                          &flags);
 900                 if (ret) {
 901                         mlog_errno(ret);
 902                         goto out;
 903                 }
 904
 905                 if (alloc_size > len_to_move)
 906                         alloc_size = len_to_move;
 907
 908                 /*
 909                  * XXX: how to deal with a hole:
 910                  *
 911                  * - skip the hole of course
 912                  * - force a new defragmentation
 913                  */
 914                 if (!phys_cpos) {
 915                         if (do_defrag)
 916                                 len_defraged = 0;
 917
 918                         goto next;
 919                 }
 920
 921                 if (do_defrag) {
 922                         ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
 923                                                      defrag_thresh, &skip);
 924                         /*
 925                          * skip large extents
 926                          */
 927                         if (skip) {
 928                                 skip = 0;
 929                                 goto next;
 930                         }
 931
 932                         mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
 933                              "alloc_size: %u, len_defraged: %u\n",
 934                              cpos, phys_cpos, alloc_size, len_defraged);
 935
 936                         ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
 937                                                   &alloc_size, flags);
 938                 } else {
 939                         ret = ocfs2_move_extent(context, cpos, phys_cpos,
 940                                                 &new_phys_cpos, alloc_size,
 941                                                 flags);
 942
 943                         new_phys_cpos += alloc_size;
 944                 }
 945
 946                 if (ret < 0) {
 947                         mlog_errno(ret);
 948                         goto out;
 949                 }
 950
 951                 context->clusters_moved += alloc_size;
 952 next:
 953                 cpos += alloc_size;
 954                 len_to_move -= alloc_size;
 955         }
 956
 957 done:
 958         range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
 959
 960 out:
 961         range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
 962                                                       context->clusters_moved);
 963         range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
 964                                                        context->new_phys_cpos);
 965
 966         ocfs2_schedule_truncate_log_flush(osb, 1);
 967         ocfs2_run_deallocs(osb, &context->dealloc);
 968
 969         return ret;
 970 }
 971
 972 static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
 973 {
 974         int status;
 975         handle_t *handle;
 976         struct inode *inode = context->inode;
 977         struct ocfs2_dinode *di;
 978         struct buffer_head *di_bh = NULL;
 979         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 980
 981         if (!inode)
 982                 return -ENOENT;
 983
 984         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 985                 return -EROFS;
 986
 987         mutex_lock(&inode->i_mutex);
 988
 989         /*
 990          * This prevents concurrent writes from other nodes
 991          */
 992         status = ocfs2_rw_lock(inode, 1);
 993         if (status) {
 994                 mlog_errno(status);
 995                 goto out;
 996         }
 997
 998         status = ocfs2_inode_lock(inode, &di_bh, 1);
 999         if (status) {
1000                 mlog_errno(status);
1001                 goto out_rw_unlock;
1002         }
1003
1004         /*
1005          * rememer ip_xattr_sem also needs to be held if necessary
1006          */
1007         down_write(&OCFS2_I(inode)->ip_alloc_sem);
1008
1009         status = __ocfs2_move_extents_range(di_bh, context);
1010
1011         up_write(&OCFS2_I(inode)->ip_alloc_sem);
1012         if (status) {
1013                 mlog_errno(status);
1014                 goto out_inode_unlock;
1015         }
1016
1017         /*
1018          * We update ctime for these changes
1019          */
1020         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1021         if (IS_ERR(handle)) {
1022                 status = PTR_ERR(handle);
1023                 mlog_errno(status);
1024                 goto out_inode_unlock;
1025         }
1026
1027         status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1028                                          OCFS2_JOURNAL_ACCESS_WRITE);
1029         if (status) {
1030                 mlog_errno(status);
1031                 goto out_commit;
1032         }
1033
1034         di = (struct ocfs2_dinode *)di_bh->b_data;
1035         inode->i_ctime = CURRENT_TIME;
1036         di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1037         di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1038
1039         ocfs2_journal_dirty(handle, di_bh);
1040
1041 out_commit:
1042         ocfs2_commit_trans(osb, handle);
1043
1044 out_inode_unlock:
1045         brelse(di_bh);
1046         ocfs2_inode_unlock(inode, 1);
1047 out_rw_unlock:
1048         ocfs2_rw_unlock(inode, 1);
1049 out:
1050         mutex_unlock(&inode->i_mutex);
1051
1052         return status;
1053 }
1054
1055 int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1056 {
1057         int status;
1058
1059         struct inode *inode = filp->f_path.dentry->d_inode;
1060         struct ocfs2_move_extents range;
1061         struct ocfs2_move_extents_context *context = NULL;
1062
1063         status = mnt_want_write(filp->f_path.mnt);
1064         if (status)
1065                 return status;
1066
1067         if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1068                 goto out;
1069
1070         if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1071                 status = -EPERM;
1072                 goto out;
1073         }
1074
1075         context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1076         if (!context) {
1077                 status = -ENOMEM;
1078                 mlog_errno(status);
1079                 goto out;
1080         }
1081
1082         context->inode = inode;
1083         context->file = filp;
1084
1085         if (argp) {
1086                 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1087                                    sizeof(range))) {
1088                         status = -EFAULT;
1089                         goto out;
1090                 }
1091         } else {
1092                 status = -EINVAL;
1093                 goto out;
1094         }
1095
1096         if (range.me_start > i_size_read(inode))
1097                 goto out;
1098
1099         if (range.me_start + range.me_len > i_size_read(inode))
1100                         range.me_len = i_size_read(inode) - range.me_start;
1101
1102         context->range = &range;
1103
1104         if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1105                 context->auto_defrag = 1;
1106                 /*
1107                  * ok, the default theshold for the defragmentation
1108                  * is 1M, since our maximum clustersize was 1M also.
1109                  * any thought?
1110                  */
1111                 if (!range.me_threshold)
1112                         range.me_threshold = 1024 * 1024;
1113
1114                 if (range.me_threshold > i_size_read(inode))
1115                         range.me_threshold = i_size_read(inode);
1116
1117                 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1118                         context->partial = 1;
1119         } else {
1120                 /*
1121                  * first best-effort attempt to validate and adjust the goal
1122                  * (physical address in block), while it can't guarantee later
1123                  * operation can succeed all the time since global_bitmap may
1124                  * change a bit over time.
1125                  */
1126
1127                 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1128                 if (status)
1129                         goto out;
1130         }
1131
1132         status = ocfs2_move_extents(context);
1133         if (status)
1134                 mlog_errno(status);
1135 out:
1136         /*
1137          * movement/defragmentation may end up being partially completed,
1138          * that's the reason why we need to return userspace the finished
1139          * length and new_offset even if failure happens somewhere.
1140          */
1141         if (argp) {
1142                 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1143                                 sizeof(range)))
1144                         status = -EFAULT;
1145         }
1146
1147         kfree(context);
1148
1149         mnt_drop_write(filp->f_path.mnt);
1150
1151         return status;
1152 }