lib/blame.c

   1 /*
   2  * Copyright (c) 2018, 2019, 2020 Stefan Sperling <stsp@openbsd.org>
   3  * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
   4  *
   5  * Permission to use, copy, modify, and distribute this software for any
   6  * purpose with or without fee is hereby granted, provided that the above
   7  * copyright notice and this permission notice appear in all copies.
   8  *
   9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16  */
  17
  18 #include <sys/mman.h>
  19 #include <sys/stat.h>
  20
  21 #include <errno.h>
  22 #include <sha1.h>
  23 #include <string.h>
  24 #include <stdio.h>
  25 #include <stdlib.h>
  26 #include <time.h>
  27 #include <limits.h>
  28 #include <zlib.h>
  29
  30 #include "got_compat.h"
  31
  32 #include "got_error.h"
  33 #include "got_object.h"
  34 #include "got_cancel.h"
  35 #include "got_blame.h"
  36 #include "got_commit_graph.h"
  37 #include "got_opentemp.h"
  38
  39 #include "got_lib_inflate.h"
  40 #include "got_lib_delta.h"
  41 #include "got_lib_object.h"
  42 #include "got_lib_diff.h"
  43
  44 #ifndef MAX
  45 #define MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
  46 #endif
  47
  48 struct got_blame_line {
  49         int annotated;
  50         struct got_object_id id;
  51 };
  52
  53 struct got_blame {
  54         struct diff_config *cfg;
  55         int nlines;     /* number of lines in file being blamed */
  56         int nannotated; /* number of lines already annotated */
  57         struct got_blame_line *lines; /* one per line */
  58         int ncommits;
  59
  60         /*
  61          * These change with every traversed commit. After diffing
  62          * commits N:N-1, in preparation for diffing commits N-1:N-2,
  63          * data for commit N is retained and flipped into data for N-1.
  64          *
  65          */
  66         FILE *f1; /* older version from commit N-1. */
  67         FILE *f2; /* newer version from commit N. */
  68         unsigned char *map1;
  69         unsigned char *map2;
  70         off_t size1;
  71         off_t size2;
  72         int nlines1;
  73         int nlines2;
  74         off_t *line_offsets1;
  75         off_t *line_offsets2;
  76
  77         /*
  78          * Map line numbers of an older version of the file to valid line
  79          * numbers in the version of the file being blamed. This map is
  80          * updated with each commit we traverse throughout the file's history.
  81          * Lines mapped to -1 do not correspond to any line in the version
  82          * being blamed.
  83          */
  84         int *linemap1;
  85         int *linemap2;
  86
  87         struct diff_data *data1;
  88         struct diff_data *data2;
  89 };
  90
  91 static const struct got_error *
  92 annotate_line(struct got_blame *blame, int lineno, struct got_object_id *id,
  93     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
  94     void *arg)
  95 {
  96         const struct got_error *err = NULL;
  97         struct got_blame_line *line;
  98
  99         if (lineno < 0 || lineno >= blame->nlines)
 100                 return NULL;
 101
 102         line = &blame->lines[lineno];
 103         if (line->annotated)
 104                 return NULL;
 105
 106         memcpy(&line->id, id, sizeof(line->id));
 107         line->annotated = 1;
 108         blame->nannotated++;
 109         if (cb)
 110                 err = cb(arg, blame->nlines, lineno + 1, id);
 111         return err;
 112 }
 113
 114 static const struct got_error *
 115 blame_changes(struct got_blame *blame, struct diff_result *diff_result,
 116     struct got_object_id *commit_id,
 117     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
 118     void *arg)
 119 {
 120         const struct got_error *err = NULL;
 121         int i;
 122         int idx1 = 0, idx2 = 0;
 123
 124         for (i = 0; i < diff_result->chunks.len &&
 125             blame->nannotated < blame->nlines; i++) {
 126                 struct diff_chunk *c = diff_chunk_get(diff_result, i);
 127                 unsigned int left_start, left_count;
 128                 unsigned int right_start, right_count;
 129                 int j;
 130
 131                 /*
 132                  * We do not need to worry about idx1/idx2 growing out
 133                  * of bounds because the diff implementation ensures
 134                  * that chunk ranges never exceed the number of lines
 135                  * in the left/right input files.
 136                  */
 137                 left_start = diff_chunk_get_left_start(c, diff_result, 0);
 138                 left_count = diff_chunk_get_left_count(c);
 139                 right_start = diff_chunk_get_right_start(c, diff_result, 0);
 140                 right_count = diff_chunk_get_right_count(c);
 141
 142                 if (left_count == right_count) {
 143                         for (j = 0; j < left_count; j++) {
 144                                 blame->linemap1[idx1++] =
 145                                     blame->linemap2[idx2++];
 146                         }
 147                         continue;
 148                 }
 149
 150                 if (right_count == 0) {
 151                         for (j = 0; j < left_count; j++) {
 152                                 blame->linemap1[idx1++] = -1;
 153                         }
 154                         continue;
 155                 }
 156
 157                 for (j = 0; j < right_count; j++) {
 158                         int ln = blame->linemap2[idx2++];
 159                         err = annotate_line(blame, ln, commit_id, cb, arg);
 160                         if (err)
 161                                 return err;
 162                         if (blame->nlines == blame->nannotated)
 163                                 break;
 164                 }
 165         }
 166
 167         return NULL;
 168 }
 169
 170 static const struct got_error *
 171 blame_prepare_file(FILE *f, unsigned char **p, off_t *size,
 172     int *nlines, off_t **line_offsets, struct diff_data *diff_data,
 173     const struct diff_config *cfg, struct got_blob_object *blob)
 174 {
 175         const struct got_error *err = NULL;
 176         int diff_flags = 0, rc;
 177
 178         err = got_object_blob_dump_to_file(size, nlines, line_offsets,
 179             f, blob);
 180         if (err)
 181                 return err;
 182
 183 #ifndef GOT_DIFF_NO_MMAP
 184         *p = mmap(NULL, *size, PROT_READ, MAP_PRIVATE, fileno(f), 0);
 185         if (*p == MAP_FAILED)
 186 #endif
 187                 *p = NULL; /* fall back on file I/O */
 188
 189         /* Allow blaming lines in binary files even though it's useless. */
 190         diff_flags |= DIFF_FLAG_FORCE_TEXT_DATA;
 191
 192         rc = diff_atomize_file(diff_data, cfg, f, *p, *size, diff_flags);
 193         if (rc)
 194                 return got_error_set_errno(rc, "diff_atomize_file");
 195
 196         return NULL;
 197 }
 198
 199 static const struct got_error *
 200 blame_commit(struct got_blame *blame, struct got_object_id *id,
 201     const char *path, struct got_repository *repo,
 202     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
 203     void *arg)
 204 {
 205         const struct got_error *err = NULL;
 206         struct got_commit_object *commit = NULL;
 207         struct got_object_qid *pid = NULL;
 208         struct got_object_id *pblob_id = NULL;
 209         struct got_blob_object *pblob = NULL;
 210         struct diff_result *diff_result = NULL;
 211
 212         err = got_object_open_as_commit(&commit, repo, id);
 213         if (err)
 214                 return err;
 215
 216         pid = STAILQ_FIRST(got_object_commit_get_parent_ids(commit));
 217         if (pid == NULL) {
 218                 got_object_commit_close(commit);
 219                 return NULL;
 220         }
 221
 222         err = got_object_id_by_path(&pblob_id, repo, pid->id, path);
 223         if (err) {
 224                 if (err->code == GOT_ERR_NO_TREE_ENTRY)
 225                         err = NULL;
 226                 goto done;
 227         }
 228
 229         err = got_object_open_as_blob(&pblob, repo, pblob_id, 8192);
 230         if (err)
 231                 goto done;
 232
 233         blame->f1 = got_opentemp();
 234         if (blame->f1 == NULL) {
 235                 err = got_error_from_errno("got_opentemp");
 236                 goto done;
 237         }
 238
 239         err = blame_prepare_file(blame->f1, &blame->map1, &blame->size1,
 240             &blame->nlines1, &blame->line_offsets1, blame->data1,
 241             blame->cfg, pblob);
 242         if (err)
 243                 goto done;
 244
 245         diff_result = diff_main(blame->cfg, blame->data1, blame->data2);
 246         if (diff_result == NULL) {
 247                 err = got_error_set_errno(ENOMEM, "malloc");
 248                 goto done;
 249         }
 250         if (diff_result->rc != DIFF_RC_OK) {
 251                 err = got_error_set_errno(diff_result->rc, "diff");
 252                 goto done;
 253         }
 254         if (diff_result->chunks.len > 0) {
 255                 if (blame->nlines1 > 0) {
 256                         blame->linemap1 = calloc(blame->nlines1,
 257                             sizeof(*blame->linemap1));
 258                         if (blame->linemap1 == NULL) {
 259                                 err = got_error_from_errno("malloc");
 260                                 goto done;
 261                         }
 262                 }
 263                 err = blame_changes(blame, diff_result, id, cb, arg);
 264                 if (err)
 265                         goto done;
 266         } else if (cb)
 267                 err = cb(arg, blame->nlines, -1, id);
 268 done:
 269         if (diff_result)
 270                 diff_result_free(diff_result);
 271         if (commit)
 272                 got_object_commit_close(commit);
 273         free(pblob_id);
 274         if (pblob)
 275                 got_object_blob_close(pblob);
 276         return err;
 277 }
 278
 279 static const struct got_error *
 280 blame_close(struct got_blame *blame)
 281 {
 282         const struct got_error *err = NULL;
 283
 284         diff_data_free(blame->data1);
 285         free(blame->data1);
 286         diff_data_free(blame->data2);
 287         free(blame->data2);
 288         if (blame->map1) {
 289                 if (munmap(blame->map1, blame->size1) == -1 && err == NULL)
 290                         err = got_error_from_errno("munmap");
 291         }
 292         if (blame->map2) {
 293                 if (munmap(blame->map2, blame->size2) == -1 && err == NULL)
 294                         err = got_error_from_errno("munmap");
 295         }
 296         if (blame->f1 && fclose(blame->f1) == EOF && err == NULL)
 297                 err = got_error_from_errno("fclose");
 298         if (blame->f2 && fclose(blame->f2) == EOF && err == NULL)
 299                 err = got_error_from_errno("fclose");
 300         free(blame->lines);
 301         free(blame->line_offsets1);
 302         free(blame->line_offsets2);
 303         free(blame->linemap1);
 304         free(blame->linemap2);
 305         free(blame->cfg);
 306         free(blame);
 307         return err;
 308 }
 309
 310 static int
 311 atomize_file(struct diff_data *d, FILE *f, off_t filesize, int nlines,
 312     off_t *line_offsets)
 313 {
 314         int i, rc = DIFF_RC_OK;
 315         int embedded_nul = 0;
 316
 317         ARRAYLIST_INIT(d->atoms, nlines);
 318
 319         for (i = 0; i < nlines; i++) {
 320                 struct diff_atom *atom;
 321                 off_t len, pos = line_offsets[i];
 322                 unsigned int hash = 0;
 323                 int j;
 324
 325                 ARRAYLIST_ADD(atom, d->atoms);
 326                 if (atom == NULL) {
 327                         rc = errno;
 328                         break;
 329                 }
 330
 331                 if (i < nlines - 1)
 332                         len = line_offsets[i + 1] - pos;
 333                 else
 334                         len = filesize - pos;
 335
 336                 if (fseeko(f, pos, SEEK_SET) == -1) {
 337                         rc = errno;
 338                         break;
 339                 }
 340                 for (j = 0; j < len; j++) {
 341                         int c = fgetc(f);
 342                         if (c == EOF) {
 343                                 if (feof(f))
 344                                         rc = EIO; /* unexpected EOF */
 345                                 else
 346                                         rc = errno;
 347                                 goto done;
 348                         }
 349
 350                         hash = diff_atom_hash_update(hash, (unsigned char)c);
 351
 352                         if (c == '\0')
 353                                 embedded_nul = 1;
 354
 355                 }
 356                 *atom = (struct diff_atom){
 357                         .root = d,
 358                         .pos = pos,
 359                         .at = NULL,     /* atom data is not memory-mapped */
 360                         .len = len,
 361                         .hash = hash,
 362                 };
 363         }
 364
 365         /* File are considered binary if they contain embedded '\0' bytes. */
 366         if (embedded_nul)
 367                 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
 368 done:
 369         if (rc)
 370                 ARRAYLIST_FREE(d->atoms);
 371
 372         return rc;
 373 }
 374
 375 static int
 376 atomize_file_mmap(struct diff_data *d, unsigned char *p,
 377     off_t filesize, int nlines, off_t *line_offsets)
 378 {
 379         int i, rc = DIFF_RC_OK;
 380         int embedded_nul = 0;
 381
 382         ARRAYLIST_INIT(d->atoms, nlines);
 383
 384         for (i = 0; i < nlines; i++) {
 385                 struct diff_atom *atom;
 386                 off_t len, pos = line_offsets[i];
 387                 unsigned int hash = 0;
 388                 int j;
 389
 390                 ARRAYLIST_ADD(atom, d->atoms);
 391                 if (atom == NULL) {
 392                         rc = errno;
 393                         break;
 394                 }
 395
 396                 if (i < nlines - 1)
 397                         len = line_offsets[i + 1] - pos;
 398                 else
 399                         len = filesize - pos;
 400
 401                 for (j = 0; j < len; j++)
 402                         hash = diff_atom_hash_update(hash, p[pos + j]);
 403
 404                 if (!embedded_nul && memchr(&p[pos], '\0', len) != NULL)
 405                         embedded_nul = 1;
 406
 407                 *atom = (struct diff_atom){
 408                         .root = d,
 409                         .pos = pos,
 410                         .at = &p[pos],
 411                         .len = len,
 412                         .hash = hash,
 413                 };
 414         }
 415
 416         /* File are considered binary if they contain embedded '\0' bytes. */
 417         if (embedded_nul)
 418                 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
 419
 420         if (rc)
 421                 ARRAYLIST_FREE(d->atoms);
 422
 423         return rc;
 424 }
 425
 426 /* Implements diff_atomize_func_t */
 427 static int
 428 blame_atomize_file(void *arg, struct diff_data *d)
 429 {
 430         struct got_blame *blame = arg;
 431
 432         if (d->f == blame->f1) {
 433                 if (blame->map1)
 434                         return atomize_file_mmap(d, blame->map1,
 435                             blame->size1, blame->nlines1,
 436                             blame->line_offsets1);
 437                 else
 438                         return atomize_file(d, blame->f1, blame->size1,
 439                             blame->nlines1, blame->line_offsets1);
 440         } else if (d->f == blame->f2) {
 441                 if (d->atoms.len > 0) {
 442                         /* Re-use data from previous commit. */
 443                         return DIFF_RC_OK;
 444                 }
 445                 if (blame->map2)
 446                         return atomize_file_mmap(d, blame->map2,
 447                             blame->size2, blame->nlines2,
 448                             blame->line_offsets2);
 449                 else
 450                         return atomize_file(d, blame->f2, blame->size2,
 451                             blame->nlines2, blame->line_offsets2);
 452         }
 453
 454         return DIFF_RC_OK;
 455 }
 456
 457 static const struct got_error *
 458 close_file2_and_reuse_file1(struct got_blame *blame)
 459 {
 460         struct diff_data *d;
 461
 462         free(blame->line_offsets2);
 463         blame->line_offsets2 = blame->line_offsets1;
 464         blame->line_offsets1 = NULL;
 465
 466         free(blame->linemap2);
 467         blame->linemap2 = blame->linemap1;
 468         blame->linemap1 = NULL;
 469
 470         if (blame->map2) {
 471                 if (munmap(blame->map2, blame->size2) == -1)
 472                         return got_error_from_errno("munmap");
 473                 blame->map2 = blame->map1;
 474                 blame->map1 = NULL;
 475
 476         }
 477         blame->size2 = blame->size1;
 478         blame->size1 = 0;
 479
 480         if (fclose(blame->f2) == EOF)
 481                 return got_error_from_errno("fclose");
 482         blame->f2 = blame->f1;
 483         blame->f1 = NULL;
 484
 485         blame->nlines2 = blame->nlines1;
 486         blame->nlines1 = 0;
 487
 488         diff_data_free(blame->data2); /* does not free pointer itself */
 489         memset(blame->data2, 0, sizeof(*blame->data2));
 490         d = blame->data2;
 491         blame->data2 = blame->data1;
 492         blame->data1 = d;
 493
 494         return NULL;
 495 }
 496
 497 static const struct got_error *
 498 blame_open(struct got_blame **blamep, const char *path,
 499     struct got_object_id *start_commit_id, struct got_repository *repo,
 500     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
 501     void *arg, got_cancel_cb cancel_cb, void *cancel_arg)
 502 {
 503         const struct got_error *err = NULL;
 504         struct got_object_id *obj_id = NULL;
 505         struct got_blob_object *blob = NULL;
 506         struct got_blame *blame = NULL;
 507         struct got_object_id *id = NULL;
 508         int lineno;
 509         struct got_commit_graph *graph = NULL;
 510
 511         *blamep = NULL;
 512
 513         err = got_object_id_by_path(&obj_id, repo, start_commit_id, path);
 514         if (err)
 515                 goto done;
 516
 517         err = got_object_open_as_blob(&blob, repo, obj_id, 8192);
 518         if (err)
 519                 goto done;
 520
 521         blame = calloc(1, sizeof(*blame));
 522         if (blame == NULL) {
 523                 err = got_error_from_errno("calloc");
 524                 goto done;
 525         }
 526
 527         blame->data1 = calloc(1, sizeof(*blame->data1));
 528         if (blame->data1 == NULL) {
 529                 err = got_error_from_errno("calloc");
 530                 goto done;
 531         }
 532         blame->data2 = calloc(1, sizeof(*blame->data2));
 533         if (blame->data2 == NULL) {
 534                 err = got_error_from_errno("calloc");
 535                 goto done;
 536         }
 537
 538         blame->f2 = got_opentemp();
 539         if (blame->f2 == NULL) {
 540                 err = got_error_from_errno("got_opentemp");
 541                 goto done;
 542         }
 543         err = got_diff_get_config(&blame->cfg, GOT_DIFF_ALGORITHM_PATIENCE,
 544             blame_atomize_file, blame);
 545         if (err)
 546                 goto done;
 547
 548         err = blame_prepare_file(blame->f2, &blame->map2, &blame->size2,
 549             &blame->nlines2, &blame->line_offsets2, blame->data2,
 550             blame->cfg, blob);
 551         blame->nlines = blame->nlines2;
 552         if (err || blame->nlines == 0)
 553                 goto done;
 554
 555         got_object_blob_close(blob);
 556         blob = NULL;
 557
 558         /* Don't include \n at EOF in the blame line count. */
 559         if (blame->line_offsets2[blame->nlines - 1] == blame->size2)
 560                 blame->nlines--;
 561
 562         blame->lines = calloc(blame->nlines, sizeof(*blame->lines));
 563         if (blame->lines == NULL) {
 564                 err = got_error_from_errno("calloc");
 565                 goto done;
 566         }
 567
 568         blame->linemap2 = calloc(blame->nlines2, sizeof(*blame->linemap2));
 569         if (blame->linemap2 == NULL) {
 570                 err = got_error_from_errno("calloc");
 571                 goto done;
 572         }
 573         for (lineno = 0; lineno < blame->nlines2; lineno++)
 574                 blame->linemap2[lineno] = lineno;
 575
 576         err = got_commit_graph_open(&graph, path, 1);
 577         if (err)
 578                 goto done;
 579
 580         err = got_commit_graph_iter_start(graph, start_commit_id, repo,
 581             cancel_cb, cancel_arg);
 582         if (err)
 583                 goto done;
 584         for (;;) {
 585                 struct got_object_id *next_id;
 586                 err = got_commit_graph_iter_next(&next_id, graph, repo,
 587                     cancel_cb, cancel_arg);
 588                 if (err) {
 589                         if (err->code == GOT_ERR_ITER_COMPLETED) {
 590                                 err = NULL;
 591                                 break;
 592                         }
 593                         goto done;
 594                 }
 595                 if (next_id) {
 596                         id = next_id;
 597                         err = blame_commit(blame, id, path, repo, cb, arg);
 598                         if (err) {
 599                                 if (err->code == GOT_ERR_ITER_COMPLETED)
 600                                         err = NULL;
 601                                 goto done;
 602                         }
 603                         if (blame->nannotated == blame->nlines)
 604                                 break;
 605
 606                         err = close_file2_and_reuse_file1(blame);
 607                         if (err)
 608                                 goto done;
 609                 }
 610         }
 611
 612         if (id && blame->nannotated < blame->nlines) {
 613                 /* Annotate remaining non-annotated lines with last commit. */
 614                 for (lineno = 0; lineno < blame->nlines; lineno++) {
 615                         err = annotate_line(blame, lineno, id, cb, arg);
 616                         if (err)
 617                                 goto done;
 618                 }
 619         }
 620
 621 done:
 622         if (graph)
 623                 got_commit_graph_close(graph);
 624         free(obj_id);
 625         if (blob)
 626                 got_object_blob_close(blob);
 627         if (err) {
 628                 if (blame)
 629                         blame_close(blame);
 630         } else
 631                 *blamep = blame;
 632
 633         return err;
 634 }
 635
 636 const struct got_error *
 637 got_blame(const char *path, struct got_object_id *commit_id,
 638     struct got_repository *repo,
 639     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
 640     void *arg, got_cancel_cb cancel_cb, void* cancel_arg)
 641 {
 642         const struct got_error *err = NULL, *close_err = NULL;
 643         struct got_blame *blame;
 644         char *abspath;
 645
 646         if (asprintf(&abspath, "%s%s", path[0] == '/' ? "" : "/", path) == -1)
 647                 return got_error_from_errno2("asprintf", path);
 648
 649         err = blame_open(&blame, abspath, commit_id, repo, cb, arg,
 650             cancel_cb, cancel_arg);
 651         free(abspath);
 652         if (blame)
 653                 close_err = blame_close(blame);
 654         return err ? err : close_err;
 655 }