lib/blame.c

   1 /*
   2  * Copyright (c) 2018, 2019, 2020 Stefan Sperling <stsp@openbsd.org>
   3  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
   4  *
   5  * Permission to use, copy, modify, and distribute this software for any
   6  * purpose with or without fee is hereby granted, provided that the above
   7  * copyright notice and this permission notice appear in all copies.
   8  *
   9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16  */
  17
  18 #include <sys/mman.h>
  19 #include <sys/stat.h>
  20
  21 #include <errno.h>
  22 #include <string.h>
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25 #include <time.h>
  26 #include <limits.h>
  27 #include <zlib.h>
  28
  29 #include "got_compat.h"
  30
  31 #include "got_error.h"
  32 #include "got_object.h"
  33 #include "got_cancel.h"
  34 #include "got_blame.h"
  35 #include "got_commit_graph.h"
  36 #include "got_opentemp.h"
  37
  38 #include "got_lib_inflate.h"
  39 #include "got_lib_delta.h"
  40 #include "got_lib_object.h"
  41 #include "got_lib_diff.h"
  42
  43 #ifndef MAX
  44 #define MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
  45 #endif
  46
  47 struct got_blame_line {
  48         int annotated;
  49         struct got_object_id id;
  50 };
  51
  52 struct got_blame {
  53         struct diff_config *cfg;
  54         int nlines;     /* number of lines in file being blamed */
  55         int nannotated; /* number of lines already annotated */
  56         struct got_blame_line *lines; /* one per line */
  57         int ncommits;
  58
  59         /*
  60          * These change with every traversed commit. After diffing
  61          * commits N:N-1, in preparation for diffing commits N-1:N-2,
  62          * data for commit N is retained and flipped into data for N-1.
  63          *
  64          */
  65         FILE *f1; /* older version from commit N-1. */
  66         FILE *f2; /* newer version from commit N. */
  67         unsigned char *map1;
  68         unsigned char *map2;
  69         off_t size1;
  70         off_t size2;
  71         int nlines1;
  72         int nlines2;
  73         off_t *line_offsets1;
  74         off_t *line_offsets2;
  75
  76         /*
  77          * Map line numbers of an older version of the file to valid line
  78          * numbers in the version of the file being blamed. This map is
  79          * updated with each commit we traverse throughout the file's history.
  80          * Lines mapped to -1 do not correspond to any line in the version
  81          * being blamed.
  82          */
  83         int *linemap1;
  84         int *linemap2;
  85
  86         struct diff_data *data1;
  87         struct diff_data *data2;
  88 };
  89
  90 static const struct got_error *
  91 annotate_line(struct got_blame *blame, int lineno, struct got_object_id *id,
  92     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
  93     void *arg)
  94 {
  95         const struct got_error *err = NULL;
  96         struct got_blame_line *line;
  97
  98         if (lineno < 0 || lineno >= blame->nlines)
  99                 return NULL;
 100
 101         line = &blame->lines[lineno];
 102         if (line->annotated)
 103                 return NULL;
 104
 105         memcpy(&line->id, id, sizeof(line->id));
 106         line->annotated = 1;
 107         blame->nannotated++;
 108         if (cb)
 109                 err = cb(arg, blame->nlines, lineno + 1, id);
 110         return err;
 111 }
 112
 113 static const struct got_error *
 114 blame_changes(struct got_blame *blame, struct diff_result *diff_result,
 115     struct got_object_id *commit_id,
 116     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
 117     void *arg)
 118 {
 119         const struct got_error *err = NULL;
 120         int i;
 121         int idx1 = 0, idx2 = 0;
 122
 123         for (i = 0; i < diff_result->chunks.len &&
 124             blame->nannotated < blame->nlines; i++) {
 125                 struct diff_chunk *c = diff_chunk_get(diff_result, i);
 126                 unsigned int left_start, left_count;
 127                 unsigned int right_start, right_count;
 128                 int j;
 129
 130                 /*
 131                  * We do not need to worry about idx1/idx2 growing out
 132                  * of bounds because the diff implementation ensures
 133                  * that chunk ranges never exceed the number of lines
 134                  * in the left/right input files.
 135                  */
 136                 left_start = diff_chunk_get_left_start(c, diff_result, 0);
 137                 left_count = diff_chunk_get_left_count(c);
 138                 right_start = diff_chunk_get_right_start(c, diff_result, 0);
 139                 right_count = diff_chunk_get_right_count(c);
 140
 141                 if (left_count == right_count) {
 142                         for (j = 0; j < left_count; j++) {
 143                                 blame->linemap1[idx1++] =
 144                                     blame->linemap2[idx2++];
 145                         }
 146                         continue;
 147                 }
 148
 149                 if (right_count == 0) {
 150                         for (j = 0; j < left_count; j++) {
 151                                 blame->linemap1[idx1++] = -1;
 152                         }
 153                         continue;
 154                 }
 155
 156                 for (j = 0; j < right_count; j++) {
 157                         int ln = blame->linemap2[idx2++];
 158                         err = annotate_line(blame, ln, commit_id, cb, arg);
 159                         if (err)
 160                                 return err;
 161                         if (blame->nlines == blame->nannotated)
 162                                 break;
 163                 }
 164         }
 165
 166         return NULL;
 167 }
 168
 169 static const struct got_error *
 170 blame_prepare_file(FILE *f, unsigned char **p, off_t *size,
 171     int *nlines, off_t **line_offsets, struct diff_data *diff_data,
 172     const struct diff_config *cfg, struct got_blob_object *blob)
 173 {
 174         const struct got_error *err = NULL;
 175         int diff_flags = 0, rc;
 176
 177         err = got_object_blob_dump_to_file(size, nlines, line_offsets,
 178             f, blob);
 179         if (err)
 180                 return err;
 181
 182 #ifndef GOT_DIFF_NO_MMAP
 183         *p = mmap(NULL, *size, PROT_READ, MAP_PRIVATE, fileno(f), 0);
 184         if (*p == MAP_FAILED)
 185 #endif
 186                 *p = NULL; /* fall back on file I/O */
 187
 188         /* Allow blaming lines in binary files even though it's useless. */
 189         diff_flags |= DIFF_FLAG_FORCE_TEXT_DATA;
 190
 191         rc = diff_atomize_file(diff_data, cfg, f, *p, *size, diff_flags);
 192         if (rc)
 193                 return got_error_set_errno(rc, "diff_atomize_file");
 194
 195         return NULL;
 196 }
 197
 198 static const struct got_error *
 199 blame_commit(struct got_blame *blame, struct got_object_id *id,
 200     const char *path, struct got_repository *repo,
 201     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
 202     void *arg)
 203 {
 204         const struct got_error *err = NULL;
 205         struct got_commit_object *commit = NULL;
 206         struct got_object_qid *pid = NULL;
 207         struct got_object_id *pblob_id = NULL;
 208         struct got_blob_object *pblob = NULL;
 209         struct diff_result *diff_result = NULL;
 210
 211         err = got_object_open_as_commit(&commit, repo, id);
 212         if (err)
 213                 return err;
 214
 215         pid = STAILQ_FIRST(got_object_commit_get_parent_ids(commit));
 216         if (pid == NULL) {
 217                 got_object_commit_close(commit);
 218                 return NULL;
 219         }
 220
 221         err = got_object_id_by_path(&pblob_id, repo, pid->id, path);
 222         if (err) {
 223                 if (err->code == GOT_ERR_NO_TREE_ENTRY)
 224                         err = NULL;
 225                 goto done;
 226         }
 227
 228         err = got_object_open_as_blob(&pblob, repo, pblob_id, 8192);
 229         if (err)
 230                 goto done;
 231
 232         blame->f1 = got_opentemp();
 233         if (blame->f1 == NULL) {
 234                 err = got_error_from_errno("got_opentemp");
 235                 goto done;
 236         }
 237
 238         err = blame_prepare_file(blame->f1, &blame->map1, &blame->size1,
 239             &blame->nlines1, &blame->line_offsets1, blame->data1,
 240             blame->cfg, pblob);
 241         if (err)
 242                 goto done;
 243
 244         diff_result = diff_main(blame->cfg, blame->data1, blame->data2);
 245         if (diff_result == NULL) {
 246                 err = got_error_set_errno(ENOMEM, "malloc");
 247                 goto done;
 248         }
 249         if (diff_result->rc != DIFF_RC_OK) {
 250                 err = got_error_set_errno(diff_result->rc, "diff");
 251                 goto done;
 252         }
 253         if (diff_result->chunks.len > 0) {
 254                 if (blame->nlines1 > 0) {
 255                         blame->linemap1 = calloc(blame->nlines1,
 256                             sizeof(*blame->linemap1));
 257                         if (blame->linemap1 == NULL) {
 258                                 err = got_error_from_errno("malloc");
 259                                 goto done;
 260                         }
 261                 }
 262                 err = blame_changes(blame, diff_result, id, cb, arg);
 263                 if (err)
 264                         goto done;
 265         } else if (cb)
 266                 err = cb(arg, blame->nlines, -1, id);
 267 done:
 268         if (diff_result)
 269                 diff_result_free(diff_result);
 270         if (commit)
 271                 got_object_commit_close(commit);
 272         free(pblob_id);
 273         if (pblob)
 274                 got_object_blob_close(pblob);
 275         return err;
 276 }
 277
 278 static const struct got_error *
 279 blame_close(struct got_blame *blame)
 280 {
 281         const struct got_error *err = NULL;
 282
 283         diff_data_free(blame->data1);
 284         free(blame->data1);
 285         diff_data_free(blame->data2);
 286         free(blame->data2);
 287         if (blame->map1) {
 288                 if (munmap(blame->map1, blame->size1) == -1 && err == NULL)
 289                         err = got_error_from_errno("munmap");
 290         }
 291         if (blame->map2) {
 292                 if (munmap(blame->map2, blame->size2) == -1 && err == NULL)
 293                         err = got_error_from_errno("munmap");
 294         }
 295         if (blame->f1 && fclose(blame->f1) == EOF && err == NULL)
 296                 err = got_error_from_errno("fclose");
 297         if (blame->f2 && fclose(blame->f2) == EOF && err == NULL)
 298                 err = got_error_from_errno("fclose");
 299         free(blame->lines);
 300         free(blame->line_offsets1);
 301         free(blame->line_offsets2);
 302         free(blame->linemap1);
 303         free(blame->linemap2);
 304         free(blame->cfg);
 305         free(blame);
 306         return err;
 307 }
 308
 309 static int
 310 atomize_file(struct diff_data *d, FILE *f, off_t filesize, int nlines,
 311     off_t *line_offsets)
 312 {
 313         int i, rc = DIFF_RC_OK;
 314         int embedded_nul = 0;
 315
 316         ARRAYLIST_INIT(d->atoms, nlines);
 317
 318         for (i = 0; i < nlines; i++) {
 319                 struct diff_atom *atom;
 320                 off_t len, pos = line_offsets[i];
 321                 unsigned int hash = 0;
 322                 int j;
 323
 324                 ARRAYLIST_ADD(atom, d->atoms);
 325                 if (atom == NULL) {
 326                         rc = errno;
 327                         break;
 328                 }
 329
 330                 if (i < nlines - 1)
 331                         len = line_offsets[i + 1] - pos;
 332                 else
 333                         len = filesize - pos;
 334
 335                 if (fseeko(f, pos, SEEK_SET) == -1) {
 336                         rc = errno;
 337                         break;
 338                 }
 339                 for (j = 0; j < len; j++) {
 340                         int c = fgetc(f);
 341                         if (c == EOF) {
 342                                 if (feof(f))
 343                                         rc = EIO; /* unexpected EOF */
 344                                 else
 345                                         rc = errno;
 346                                 goto done;
 347                         }
 348
 349                         hash = diff_atom_hash_update(hash, (unsigned char)c);
 350
 351                         if (c == '\0')
 352                                 embedded_nul = 1;
 353
 354                 }
 355                 *atom = (struct diff_atom){
 356                         .root = d,
 357                         .pos = pos,
 358                         .at = NULL,     /* atom data is not memory-mapped */
 359                         .len = len,
 360                         .hash = hash,
 361                 };
 362         }
 363
 364         /* File are considered binary if they contain embedded '\0' bytes. */
 365         if (embedded_nul)
 366                 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
 367 done:
 368         if (rc)
 369                 ARRAYLIST_FREE(d->atoms);
 370
 371         return rc;
 372 }
 373
 374 static int
 375 atomize_file_mmap(struct diff_data *d, unsigned char *p,
 376     off_t filesize, int nlines, off_t *line_offsets)
 377 {
 378         int i, rc = DIFF_RC_OK;
 379         int embedded_nul = 0;
 380
 381         ARRAYLIST_INIT(d->atoms, nlines);
 382
 383         for (i = 0; i < nlines; i++) {
 384                 struct diff_atom *atom;
 385                 off_t len, pos = line_offsets[i];
 386                 unsigned int hash = 0;
 387                 int j;
 388
 389                 ARRAYLIST_ADD(atom, d->atoms);
 390                 if (atom == NULL) {
 391                         rc = errno;
 392                         break;
 393                 }
 394
 395                 if (i < nlines - 1)
 396                         len = line_offsets[i + 1] - pos;
 397                 else
 398                         len = filesize - pos;
 399
 400                 for (j = 0; j < len; j++)
 401                         hash = diff_atom_hash_update(hash, p[pos + j]);
 402
 403                 if (!embedded_nul && memchr(&p[pos], '\0', len) != NULL)
 404                         embedded_nul = 1;
 405
 406                 *atom = (struct diff_atom){
 407                         .root = d,
 408                         .pos = pos,
 409                         .at = &p[pos],
 410                         .len = len,
 411                         .hash = hash,
 412                 };
 413         }
 414
 415         /* File are considered binary if they contain embedded '\0' bytes. */
 416         if (embedded_nul)
 417                 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
 418
 419         if (rc)
 420                 ARRAYLIST_FREE(d->atoms);
 421
 422         return rc;
 423 }
 424
 425 /* Implements diff_atomize_func_t */
 426 static int
 427 blame_atomize_file(void *arg, struct diff_data *d)
 428 {
 429         struct got_blame *blame = arg;
 430
 431         if (d->f == blame->f1) {
 432                 if (blame->map1)
 433                         return atomize_file_mmap(d, blame->map1,
 434                             blame->size1, blame->nlines1,
 435                             blame->line_offsets1);
 436                 else
 437                         return atomize_file(d, blame->f1, blame->size1,
 438                             blame->nlines1, blame->line_offsets1);
 439         } else if (d->f == blame->f2) {
 440                 if (d->atoms.len > 0) {
 441                         /* Re-use data from previous commit. */
 442                         return DIFF_RC_OK;
 443                 }
 444                 if (blame->map2)
 445                         return atomize_file_mmap(d, blame->map2,
 446                             blame->size2, blame->nlines2,
 447                             blame->line_offsets2);
 448                 else
 449                         return atomize_file(d, blame->f2, blame->size2,
 450                             blame->nlines2, blame->line_offsets2);
 451         }
 452
 453         return DIFF_RC_OK;
 454 }
 455
 456 static const struct got_error *
 457 close_file2_and_reuse_file1(struct got_blame *blame)
 458 {
 459         struct diff_data *d;
 460
 461         free(blame->line_offsets2);
 462         blame->line_offsets2 = blame->line_offsets1;
 463         blame->line_offsets1 = NULL;
 464
 465         free(blame->linemap2);
 466         blame->linemap2 = blame->linemap1;
 467         blame->linemap1 = NULL;
 468
 469         if (blame->map2) {
 470                 if (munmap(blame->map2, blame->size2) == -1)
 471                         return got_error_from_errno("munmap");
 472                 blame->map2 = blame->map1;
 473                 blame->map1 = NULL;
 474
 475         }
 476         blame->size2 = blame->size1;
 477         blame->size1 = 0;
 478
 479         if (fclose(blame->f2) == EOF)
 480                 return got_error_from_errno("fclose");
 481         blame->f2 = blame->f1;
 482         blame->f1 = NULL;
 483
 484         blame->nlines2 = blame->nlines1;
 485         blame->nlines1 = 0;
 486
 487         diff_data_free(blame->data2); /* does not free pointer itself */
 488         memset(blame->data2, 0, sizeof(*blame->data2));
 489         d = blame->data2;
 490         blame->data2 = blame->data1;
 491         blame->data1 = d;
 492
 493         return NULL;
 494 }
 495
 496 static const struct got_error *
 497 blame_open(struct got_blame **blamep, const char *path,
 498     struct got_object_id *start_commit_id, struct got_repository *repo,
 499     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
 500     void *arg, got_cancel_cb cancel_cb, void *cancel_arg)
 501 {
 502         const struct got_error *err = NULL;
 503         struct got_object_id *obj_id = NULL;
 504         struct got_blob_object *blob = NULL;
 505         struct got_blame *blame = NULL;
 506         struct got_object_id *id = NULL;
 507         int lineno;
 508         struct got_commit_graph *graph = NULL;
 509
 510         *blamep = NULL;
 511
 512         err = got_object_id_by_path(&obj_id, repo, start_commit_id, path);
 513         if (err)
 514                 goto done;
 515
 516         err = got_object_open_as_blob(&blob, repo, obj_id, 8192);
 517         if (err)
 518                 goto done;
 519
 520         blame = calloc(1, sizeof(*blame));
 521         if (blame == NULL) {
 522                 err = got_error_from_errno("calloc");
 523                 goto done;
 524         }
 525
 526         blame->data1 = calloc(1, sizeof(*blame->data1));
 527         if (blame->data1 == NULL) {
 528                 err = got_error_from_errno("calloc");
 529                 goto done;
 530         }
 531         blame->data2 = calloc(1, sizeof(*blame->data2));
 532         if (blame->data2 == NULL) {
 533                 err = got_error_from_errno("calloc");
 534                 goto done;
 535         }
 536
 537         blame->f2 = got_opentemp();
 538         if (blame->f2 == NULL) {
 539                 err = got_error_from_errno("got_opentemp");
 540                 goto done;
 541         }
 542         err = got_diff_get_config(&blame->cfg, GOT_DIFF_ALGORITHM_PATIENCE,
 543             blame_atomize_file, blame);
 544         if (err)
 545                 goto done;
 546
 547         err = blame_prepare_file(blame->f2, &blame->map2, &blame->size2,
 548             &blame->nlines2, &blame->line_offsets2, blame->data2,
 549             blame->cfg, blob);
 550         blame->nlines = blame->nlines2;
 551         if (err || blame->nlines == 0)
 552                 goto done;
 553
 554         got_object_blob_close(blob);
 555         blob = NULL;
 556
 557         /* Don't include \n at EOF in the blame line count. */
 558         if (blame->line_offsets2[blame->nlines - 1] == blame->size2)
 559                 blame->nlines--;
 560
 561         blame->lines = calloc(blame->nlines, sizeof(*blame->lines));
 562         if (blame->lines == NULL) {
 563                 err = got_error_from_errno("calloc");
 564                 goto done;
 565         }
 566
 567         blame->linemap2 = calloc(blame->nlines2, sizeof(*blame->linemap2));
 568         if (blame->linemap2 == NULL) {
 569                 err = got_error_from_errno("calloc");
 570                 goto done;
 571         }
 572         for (lineno = 0; lineno < blame->nlines2; lineno++)
 573                 blame->linemap2[lineno] = lineno;
 574
 575         err = got_commit_graph_open(&graph, path, 1);
 576         if (err)
 577                 goto done;
 578
 579         err = got_commit_graph_iter_start(graph, start_commit_id, repo,
 580             cancel_cb, cancel_arg);
 581         if (err)
 582                 goto done;
 583         for (;;) {
 584                 struct got_object_id *next_id;
 585                 err = got_commit_graph_iter_next(&next_id, graph, repo,
 586                     cancel_cb, cancel_arg);
 587                 if (err) {
 588                         if (err->code == GOT_ERR_ITER_COMPLETED) {
 589                                 err = NULL;
 590                                 break;
 591                         }
 592                         goto done;
 593                 }
 594                 if (next_id) {
 595                         id = next_id;
 596                         err = blame_commit(blame, id, path, repo, cb, arg);
 597                         if (err) {
 598                                 if (err->code == GOT_ERR_ITER_COMPLETED)
 599                                         err = NULL;
 600                                 goto done;
 601                         }
 602                         if (blame->nannotated == blame->nlines)
 603                                 break;
 604
 605                         err = close_file2_and_reuse_file1(blame);
 606                         if (err)
 607                                 goto done;
 608                 }
 609         }
 610
 611         if (id && blame->nannotated < blame->nlines) {
 612                 /* Annotate remaining non-annotated lines with last commit. */
 613                 for (lineno = 0; lineno < blame->nlines; lineno++) {
 614                         err = annotate_line(blame, lineno, id, cb, arg);
 615                         if (err)
 616                                 goto done;
 617                 }
 618         }
 619
 620 done:
 621         if (graph)
 622                 got_commit_graph_close(graph);
 623         free(obj_id);
 624         if (blob)
 625                 got_object_blob_close(blob);
 626         if (err) {
 627                 if (blame)
 628                         blame_close(blame);
 629         } else
 630                 *blamep = blame;
 631
 632         return err;
 633 }
 634
 635 const struct got_error *
 636 got_blame(const char *path, struct got_object_id *commit_id,
 637     struct got_repository *repo,
 638     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
 639     void *arg, got_cancel_cb cancel_cb, void* cancel_arg)
 640 {
 641         const struct got_error *err = NULL, *close_err = NULL;
 642         struct got_blame *blame;
 643         char *abspath;
 644
 645         if (asprintf(&abspath, "%s%s", path[0] == '/' ? "" : "/", path) == -1)
 646                 return got_error_from_errno2("asprintf", path);
 647
 648         err = blame_open(&blame, abspath, commit_id, repo, cb, arg,
 649             cancel_cb, cancel_arg);
 650         free(abspath);
 651         if (blame)
 652                 close_err = blame_close(blame);
 653         return err ? err : close_err;
 654 }