src/viewer/ascii.c

   1 /*
   2    Internal file viewer for the Midnight Commander
   3    Function for plain view
   4
   5    Copyright (C) 1994-2015
   6    Free Software Foundation, Inc.
   7
   8    Written by:
   9    Miguel de Icaza, 1994, 1995, 1998
  10    Janne Kukonlehto, 1994, 1995
  11    Jakub Jelinek, 1995
  12    Joseph M. Hinkle, 1996
  13    Norbert Warmuth, 1997
  14    Pavel Machek, 1998
  15    Roland Illig <roland.illig@gmx.de>, 2004, 2005
  16    Slava Zanko <slavazanko@google.com>, 2009
  17    Andrew Borodin <aborodin@vmail.ru>, 2009-2014
  18    Ilia Maslakov <il.smind@gmail.com>, 2009
  19    Rewritten almost from scratch by:
  20    Egmont Koblinger <egmont@gmail.com>, 2014
  21
  22    This file is part of the Midnight Commander.
  23
  24    The Midnight Commander is free software: you can redistribute it
  25    and/or modify it under the terms of the GNU General Public License as
  26    published by the Free Software Foundation, either version 3 of the License,
  27    or (at your option) any later version.
  28
  29    The Midnight Commander is distributed in the hope that it will be useful,
  30    but WITHOUT ANY WARRANTY; without even the implied warranty of
  31    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  32    GNU General Public License for more details.
  33
  34    You should have received a copy of the GNU General Public License
  35    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  36
  37    ------------------------------------------------------------------------------------------------
  38
  39    The viewer is implemented along the following design principles:
  40
  41    Goals: Always display simple scripts, double wide (CJK), combining accents and spacing marks
  42    (often used e.g. in Devanagari) perfectly. Make the arrow keys always work correctly.
  43
  44    Absolutely non-goal: RTL.
  45
  46    Terminology:
  47
  48    - A "paragraph" is the text between two adjacent newline characters. A "line" or "row" is a
  49    visual row on the screen. In wrap mode, the viewer formats a paragraph into one or more lines.
  50
  51    - The Unicode glossary <http://www.unicode.org/glossary/> doesn't seem to have a notion of "base
  52    character followed by zero or more combining characters". The closest matches are "Combining
  53    Character Sequence" meaning a base character followed by one or more combining characters, or
  54    "Grapheme" which seems to exclude non-printable characters such as newline. In this file,
  55    "combining character sequence" (or any obvious abbreviation thereof) means a base character
  56    followed by zero or more (up to a current limit of 4) combining characters.
  57
  58    ------------------------------------------------------------------------------------------------
  59
  60    The parser-formatter is designed to be stateless across paragraphs. This is so that we can walk
  61    backwards without having to reparse the whole file (although we still need to reparse and
  62    reformat the whole paragraph, but it's a lot better). This principle needs to be changed if we
  63    ever get to address tickets 1849/2977, but then we can still store (for efficiency) the parser
  64    state at the beginning of the paragraph, and safely walk backwards if we don't cross an escape
  65    character.
  66
  67    The parser-formatter, however, definitely needs to carry a state across lines. Currently this
  68    state contains:
  69
  70    - The logical column (as if we didn't wrap). This is used for handling TAB characters after a
  71    wordwrap consistently with less.
  72
  73    - Whether the last nroff character was bold or underlined. This is used for displaying the
  74    ambiguous _\b_ sequence consistently with less.
  75
  76    - Whether the desired way of displaying a lonely combining accent or spacing mark is to place it
  77    over a dotted circle (we do this at the beginning of the paragraph or after a TAB), or to ignore
  78    the combining char and show replacement char for the spacing mark (we do this if e.g. too many
  79    of these were encountered and hence we don't glue them with their base character).
  80
  81    - (This state needs to be expanded if e.g. we decide to print verbose replacement characters
  82    (e.g. "<U+0080>") and allow these to wrap around lines.)
  83
  84    The state also contains the file offset, as it doesn't make sense to ever know the state without
  85    knowing the corresponding offset.
  86
  87    The state depends on various settings (viewer width, encoding, nroff mode, charwrap or wordwrap
  88    mode (if we'll have that one day) etc.), needs to be recomputed if any of these changes.
  89
  90    Walking forwards is usually relatively easy both in the file and on the screen. Walking
  91    backwards within a paragraph would only be possible in some special cases and even then it would
  92    be painful, so we always walk back to the beginning of the paragraph and reparse-reformat from
  93    there.
  94
  95    (Walking back within a line in the file would have at least the following difficulties: handling
  96    the parser state; processing invalid UTF-8; processing invalid nroff (e.g. what is "_\bA\bA"?).
  97    Walking back on the display: we wouldn't know where to display the last line of a paragraph, or
  98    where to display a line if its following line starts with a wide (CJK or Tab) character. Long
  99    story short: just forget this approach.)
 100
 101    Most important variables:
 102
 103    - dpy_start: Both in unwrap and wrap modes this points to the beginning of the topmost displayed
 104    paragraph.
 105
 106    - dpy_text_column: Only in unwrap mode, an additional horizontal scroll.
 107
 108    - dpy_paragraph_skip_lines: Only in wrap mode, an additional vertical scroll (the number of
 109    lines that are scrolled off at the top from the topmost paragraph).
 110
 111    - dpy_state_top: Only in wrap mode, the offset and parser-formatter state at the line where
 112    displaying the file begins is cached here.
 113
 114    - dpy_wrap_dirty: If some parameter has changed that makes it necessary to reparse-redisplay the
 115    topmost paragraph.
 116
 117    In wrap mode, the three variables "dpy_start", "dpy_paragraph_skip_lines" and "dpy_state_top"
 118    are kept consistent. Think of the first two as the ones describing the position, and the third
 119    as a cached value for better performance so that we don't need to wrap the invisible beginning
 120    of the topmost paragraph over and over again. The third value needs to be recomputed each time a
 121    parameter that influences parsing or displaying the file (e.g. width of screen, encoding, nroff
 122    mode) changes, this is signaled by "dpy_wrap_dirty" to force recomputing "dpy_state_top" (and
 123    clamp "dpy_paragraph_skip_lines" if necessary).
 124
 125    ------------------------------------------------------------------------------------------------
 126
 127    Help integration
 128
 129    I'm planning to port the help viewer to this codebase.
 130
 131    Splitting at sections would still happen in the help viewer. It would either copy a section, or
 132    set force_max and a similar force_min to limit displaying to one section only.
 133
 134    Parsing the help format would go next to the nroff parser. The colors, alternate character set,
 135    and emitting the version number would go to the "state". (The version number would be
 136    implemented by emitting remaining characters of a buffer in the "state" one by one, without
 137    advancing in the file position.)
 138
 139    The active link would be drawn similarly to the search highlight. Other than that, the viewer
 140    wouldn't care about links (except for their color). help.c would keep track of which one is
 141    highlighted, how to advance to the next/prev on an arrow, how the scroll offset needs to be
 142    adjusted when moving, etc.
 143
 144    Add wrapping at word boundaries to where wrapping at char boundaries happens now.
 145  */
 146
 147 #include <config.h>
 148
 149 #include "lib/global.h"
 150 #include "lib/tty/tty.h"
 151 #include "lib/skin.h"
 152 #include "lib/util.h"           /* is_printable() */
 153 #ifdef HAVE_CHARSET
 154 #include "lib/charsets.h"
 155 #endif
 156
 157 #include "src/setup.h"          /* option_tab_spacing */
 158
 159 #include "internal.h"
 160
 161 /*** global variables ****************************************************************************/
 162
 163 /*** file scope macro definitions ****************************************************************/
 164
 165 #if GLIB_CHECK_VERSION (2, 30, 0)
 166 #define SPACING_MARK G_UNICODE_SPACING_MARK
 167 #else
 168 #define SPACING_MARK G_UNICODE_COMBINING_MARK
 169 #endif
 170
 171 /* The Unicode standard recommends that lonely combining characters are printed over a dotted
 172  * circle. If the terminal is not UTF-8, this will be replaced by a dot anyway. */
 173 #define BASE_CHARACTER_FOR_LONELY_COMBINING 0x25CC      /* dotted circle */
 174 #define MAX_COMBINING_CHARS 4   /* both slang and ncurses support exactly 4 */
 175
 176 /* I think anything other than space (e.g. arrows) just introduce visual clutter without actually
 177  * adding value. */
 178 #define PARTIAL_CJK_AT_LEFT_MARGIN  ' '
 179 #define PARTIAL_CJK_AT_RIGHT_MARGIN ' '
 180
 181 /*
 182  * Wrap mode: This is for safety so that jumping to the end of file (which already includes
 183  * scrolling back by a page) and then walking backwards is reasonably fast, even if the file is
 184  * extremely large and consists of maybe full zeros or something like that. If there's no newline
 185  * found within this limit, just start displaying from there and see what happens. We might get
 186  * some displaying parameteres (most importantly the columns) incorrect, but at least will show the
 187  * file without spinning the CPU for ages. When scrolling back to that point, the user might see a
 188  * garbled first line (even starting with an invalid partial UTF-8), but then walking back by yet
 189  * another line should fix it.
 190  *
 191  * Unwrap mode: This is not used, we wouldn't be able to do anything reasonable without walking
 192  * back a whole paragraph (well, view->data_area.height paragraphs actually).
 193  */
 194 #define MAX_BACKWARDS_WALK_IN_PARAGRAPH (100 * 1000)
 195
 196 /*** file scope type declarations ****************************************************************/
 197
 198 /*** file scope variables ************************************************************************/
 199
 200 /* --------------------------------------------------------------------------------------------- */
 201 /*** file scope functions ************************************************************************/
 202 /* --------------------------------------------------------------------------------------------- */
 203
 204 /* TODO: These methods shouldn't be necessary, see ticket 3257 */
 205
 206 static int
 207 mcview_wcwidth (const mcview_t * view, int c)
 208 {
 209 #ifdef HAVE_CHARSET
 210     if (view->utf8)
 211     {
 212         if (g_unichar_iswide (c))
 213             return 2;
 214         if (g_unichar_iszerowidth (c))
 215             return 0;
 216     }
 217 #else
 218     (void) view;
 219     (void) c;
 220 #endif /* HAVE_CHARSET */
 221     return 1;
 222 }
 223
 224 /* --------------------------------------------------------------------------------------------- */
 225
 226 static gboolean
 227 mcview_ismark (const mcview_t * view, int c)
 228 {
 229 #ifdef HAVE_CHARSET
 230     if (view->utf8)
 231         return g_unichar_ismark (c);
 232 #else
 233     (void) view;
 234     (void) c;
 235 #endif /* HAVE_CHARSET */
 236     return FALSE;
 237 }
 238
 239 /* --------------------------------------------------------------------------------------------- */
 240
 241 /* actually is_non_spacing_mark_or_enclosing_mark */
 242 static gboolean
 243 mcview_is_non_spacing_mark (const mcview_t * view, int c)
 244 {
 245 #ifdef HAVE_CHARSET
 246     if (view->utf8)
 247     {
 248         GUnicodeType type;
 249
 250         type = g_unichar_type (c);
 251
 252         return type == G_UNICODE_NON_SPACING_MARK || type == G_UNICODE_ENCLOSING_MARK;
 253     }
 254 #else
 255     (void) view;
 256     (void) c;
 257 #endif /* HAVE_CHARSET */
 258     return FALSE;
 259 }
 260
 261 /* --------------------------------------------------------------------------------------------- */
 262
 263 #if 0
 264 static gboolean
 265 mcview_is_spacing_mark (const mcview_t * view, int c)
 266 {
 267 #ifdef HAVE_CHARSET
 268     if (view->utf8)
 269         return g_unichar_type (c) == SPACING_MARK;
 270 #else
 271     (void) view;
 272     (void) c;
 273 #endif /* HAVE_CHARSET */
 274     return FALSE;
 275 }
 276 #endif /* 0 */
 277
 278 /* --------------------------------------------------------------------------------------------- */
 279
 280 static gboolean
 281 mcview_isprint (const mcview_t * view, int c)
 282 {
 283 #ifdef HAVE_CHARSET
 284     if (!view->utf8)
 285         c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
 286     return g_unichar_isprint (c);
 287 #else
 288     (void) view;
 289 #endif /* HAVE_CHARSET */
 290     /* TODO this is very-very buggy by design: ticket 3257 comments 0-1 */
 291     return is_printable (c);
 292 }
 293
 294 /* --------------------------------------------------------------------------------------------- */
 295
 296 static int
 297 mcview_char_display (const mcview_t * view, int c, char *s)
 298 {
 299 #ifdef HAVE_CHARSET
 300     if (mc_global.utf8_display)
 301     {
 302         if (!view->utf8)
 303             c = convert_from_8bit_to_utf_c ((unsigned char) c, view->converter);
 304         if (!g_unichar_isprint (c))
 305             c = '.';
 306         return g_unichar_to_utf8 (c, s);
 307     }
 308     if (view->utf8)
 309     {
 310         if (g_unichar_iswide (c))
 311         {
 312             s[0] = s[1] = '.';
 313             return 2;
 314         }
 315         if (g_unichar_iszerowidth (c))
 316             return 0;
 317         /* TODO the is_printable check below will be broken for this */
 318         c = convert_from_utf_to_current_c (c, view->converter);
 319     }
 320     else
 321     {
 322         /* TODO the is_printable check below will be broken for this */
 323         c = convert_to_display_c (c);
 324     }
 325 #else
 326     (void) view;
 327 #endif /* HAVE_CHARSET */
 328     /* TODO this is very-very buggy by design: ticket 3257 comments 0-1 */
 329     if (!is_printable (c))
 330         c = '.';
 331     *s = c;
 332     return 1;
 333 }
 334
 335 /* --------------------------------------------------------------------------------------------- */
 336
 337 /**
 338  * Just for convenience, a common interface in front of mcview_get_utf and mcview_get_byte, so that
 339  * the caller doesn't have to care about utf8 vs 8-bit modes.
 340  *
 341  * Normally: stores c, updates state, returns TRUE.
 342  * At EOF: state is unchanged, c is undefined, returns FALSE.
 343  *
 344  * Also, temporary hack: handle force_max here.
 345  * TODO: move it to lower layers (datasource.c)?
 346  */
 347 static gboolean
 348 mcview_get_next_char (mcview_t * view, mcview_state_machine_t * state, int *c)
 349 {
 350     /* Pretend EOF if we reached force_max */
 351     if (view->force_max >= 0 && state->offset >= view->force_max)
 352         return FALSE;
 353
 354 #ifdef HAVE_CHARSET
 355     if (view->utf8)
 356     {
 357         gboolean result;
 358         int char_length;
 359
 360         *c = mcview_get_utf (view, state->offset, &char_length, &result);
 361         if (!result)
 362             return FALSE;
 363         /* Pretend EOF if we crossed force_max */
 364         if (view->force_max >= 0 && state->offset + char_length > view->force_max)
 365             return FALSE;
 366
 367         state->offset += char_length;
 368         return TRUE;
 369     }
 370 #endif /* HAVE_CHARSET */
 371     if (!mcview_get_byte (view, state->offset, c))
 372         return FALSE;
 373     state->offset++;
 374     return TRUE;
 375 }
 376
 377 /* --------------------------------------------------------------------------------------------- */
 378 /**
 379  * This function parses the next nroff character and gives it to you along with its desired color,
 380  * so you never have to care about nroff again.
 381  *
 382  * The nroff mode does the backspace trick for every single character (Unicode codepoint). At least
 383  * that's what the GNU groff 1.22 package produces, and that's what less 458 expects. For
 384  * double-wide characters (CJK), still only a single backspace is emitted. For combining accents
 385  * and such, the print-backspace-print step is repeated for the base character and then for each
 386  * accent separately.
 387  *
 388  * So, the right place for this layer is after the bytes are interpreted in UTF-8, but before
 389  * joining a base character with its combining accents.
 390  *
 391  * Normally: stores c and color, updates state, returns TRUE.
 392  * At EOF: state is unchanged, c and color are undefined, returns FALSE.
 393  *
 394  * color can be null if the caller doesn't care.
 395  */
 396 static gboolean
 397 mcview_get_next_maybe_nroff_char (mcview_t * view, mcview_state_machine_t * state, int *c,
 398                                   int *color)
 399 {
 400     mcview_state_machine_t state_after_nroff;
 401     int c2, c3;
 402
 403     if (color != NULL)
 404         *color = VIEW_NORMAL_COLOR;
 405
 406     if (!view->text_nroff_mode)
 407         return mcview_get_next_char (view, state, c);
 408
 409     if (!mcview_get_next_char (view, state, c))
 410         return FALSE;
 411     /* Don't allow nroff formatting around CR, LF, TAB or other special chars */
 412     if (!mcview_isprint (view, *c))
 413         return TRUE;
 414
 415     state_after_nroff = *state;
 416
 417     if (!mcview_get_next_char (view, &state_after_nroff, &c2))
 418         return TRUE;
 419     if (c2 != '\b')
 420         return TRUE;
 421
 422     if (!mcview_get_next_char (view, &state_after_nroff, &c3))
 423         return TRUE;
 424     if (!mcview_isprint (view, c3))
 425         return TRUE;
 426
 427     if (*c == '_' && c3 == '_')
 428     {
 429         *state = state_after_nroff;
 430         if (color != NULL)
 431             *color =
 432                 state->nroff_underscore_is_underlined ? VIEW_UNDERLINED_COLOR : VIEW_BOLD_COLOR;
 433     }
 434     else if (*c == c3)
 435     {
 436         *state = state_after_nroff;
 437         state->nroff_underscore_is_underlined = FALSE;
 438         if (color != NULL)
 439             *color = VIEW_BOLD_COLOR;
 440     }
 441     else if (*c == '_')
 442     {
 443         *c = c3;
 444         *state = state_after_nroff;
 445         state->nroff_underscore_is_underlined = TRUE;
 446         if (color != NULL)
 447             *color = VIEW_UNDERLINED_COLOR;
 448     }
 449
 450     return TRUE;
 451 }
 452
 453 /* --------------------------------------------------------------------------------------------- */
 454 /**
 455  * Get one base character, along with its combining or spacing mark characters.
 456  *
 457  * (A spacing mark is a character that extends the base character's width 1 into a combined
 458  * character of width 2, yet these two character cells should not be separated. E.g. Devanagari
 459  * <U+0939><U+094B>.)
 460  *
 461  * This method exists mainly for two reasons. One is to be able to tell if we fit on the current
 462  * line or need to wrap to the next one. The other is that both slang and ncurses seem to require
 463  * that the character and its combining marks are printed in a single call (or is it just a
 464  * limitation of mc's wrapper to them?).
 465  *
 466  * For convenience, this method takes care of converting CR or CR+LF into LF.
 467  * TODO this should probably happen later, when displaying the file?
 468  *
 469  * Normally: stores cs and color, updates state, returns >= 1 (entries in cs).
 470  * At EOF: state is unchanged, cs and color are undefined, returns 0.
 471  *
 472  * @param view ...
 473  * @param state the parser-formatter state machine's state, updated
 474  * @param cs store the characters here
 475  * @param clen the room available in cs (that is, at most clen-1 combining marks are allowed), must
 476  *   be at least 2
 477  * @param color if non-NULL, store the color here, taken from the first codepoint's color
 478  * @return the number of entries placed in cs, or 0 on EOF
 479  */
 480 static int
 481 mcview_next_combining_char_sequence (mcview_t * view, mcview_state_machine_t * state, int *cs,
 482                                      int clen, int *color)
 483 {
 484     int i = 1;
 485
 486     if (!mcview_get_next_maybe_nroff_char (view, state, cs, color))
 487         return 0;
 488
 489     /* Process \r and \r\n newlines. */
 490     if (cs[0] == '\r')
 491     {
 492         int cnext;
 493
 494         mcview_state_machine_t state_after_crlf = *state;
 495         if (mcview_get_next_maybe_nroff_char (view, &state_after_crlf, &cnext, NULL)
 496             && cnext == '\n')
 497             *state = state_after_crlf;
 498         cs[0] = '\n';
 499         return 1;
 500     }
 501
 502     /* We don't want combining over non-printable characters. This includes '\n' and '\t' too. */
 503     if (!mcview_isprint (view, cs[0]))
 504         return 1;
 505
 506     if (mcview_ismark (view, cs[0]))
 507     {
 508         if (!state->print_lonely_combining)
 509         {
 510             /* First character is combining. Either just return it, ... */
 511             return 1;
 512         }
 513         else
 514         {
 515             /* or place this (and subsequent combining ones) over a dotted circle. */
 516             cs[1] = cs[0];
 517             cs[0] = BASE_CHARACTER_FOR_LONELY_COMBINING;
 518             i = 2;
 519         }
 520     }
 521
 522     if (mcview_wcwidth (view, cs[0]) == 2)
 523     {
 524         /* Don't allow combining or spacing mark for wide characters, is this okay? */
 525         return 1;
 526     }
 527
 528     /* Look for more combining chars. Either at most clen-1 zero-width combining chars,
 529      * or at most 1 spacing mark. Is this logic correct? */
 530     for (; i < clen; i++)
 531     {
 532         mcview_state_machine_t state_after_combining;
 533
 534         state_after_combining = *state;
 535         if (!mcview_get_next_maybe_nroff_char (view, &state_after_combining, &cs[i], NULL))
 536             return i;
 537         if (!mcview_ismark (view, cs[i]) || !mcview_isprint (view, cs[i]))
 538             return i;
 539         if (g_unichar_type (cs[i]) == SPACING_MARK)
 540         {
 541             /* Only allow as the first combining char. Stop processing in either case. */
 542             if (i == 1)
 543             {
 544                 *state = state_after_combining;
 545                 i++;
 546             }
 547             return i;
 548         }
 549         *state = state_after_combining;
 550     }
 551     return i;
 552 }
 553
 554 /* --------------------------------------------------------------------------------------------- */
 555 /**
 556  * Parse, format and possibly display one visual line of text.
 557  *
 558  * Formatting starts at the given "state" (which encodes the file offset and parser and formatter's
 559  * internal state). In unwrap mode, this should point to the beginning of the paragraph with the
 560  * default state, the additional horizontal scrolling is added here. In wrap mode, this should
 561  * point to the beginning of the line, with the proper state at that point.
 562  *
 563  * In wrap mode, if a line ends in a newline, it is consumed, even if it's exactly at the right
 564  * edge. In unwrap mode, the whole remaining line, including the newline is consumed. Displaying
 565  * the next line should start at "state"'s new value, or if we displayed the bottom line then
 566  * state->offset tells the file offset to be shown in the top bar.
 567  *
 568  * If "row" is offscreen, don't actually display the line but still update "state" and return the
 569  * proper value. This is used by mcview_wrap_move_down to advance in the file.
 570  *
 571  * @param view ...
 572  * @param state the parser-formatter state machine's state, updated
 573  * @param row print to this row
 574  * @param paragraph_ended store TRUE if paragraph ended by newline or EOF, FALSE if wraps to next
 575  *   line
 576  * @param linewidth store the width of the line here
 577  * @return the number of rows, that is, 0 if we were already at EOF, otherwise 1
 578  */
 579 static int
 580 mcview_display_line (mcview_t * view, mcview_state_machine_t * state, int row,
 581                      gboolean * paragraph_ended, off_t * linewidth)
 582 {
 583     const screen_dimen left = view->data_area.left;
 584     const screen_dimen top = view->data_area.top;
 585     const screen_dimen width = view->data_area.width;
 586     const screen_dimen height = view->data_area.height;
 587     off_t dpy_text_column = view->text_wrap_mode ? 0 : view->dpy_text_column;
 588     screen_dimen col = 0;
 589     int cs[1 + MAX_COMBINING_CHARS];
 590     char str[(1 + MAX_COMBINING_CHARS) * UTF8_CHAR_LEN + 1];
 591     int i, j;
 592
 593     if (paragraph_ended != NULL)
 594         *paragraph_ended = TRUE;
 595
 596     if (!view->text_wrap_mode && (row < 0 || row >= (int) height) && linewidth == NULL)
 597     {
 598         /* Optimization: Fast forward to the end of the line, rather than carefully
 599          * parsing and then not actually displaying it. */
 600         off_t eol;
 601         int retval;
 602
 603         eol = mcview_eol (view, state->offset, mcview_get_filesize (view));
 604         retval = (eol > state->offset) ? 1 : 0;
 605
 606         mcview_state_machine_init (state, eol);
 607         return retval;
 608     }
 609
 610     while (TRUE)
 611     {
 612         int charwidth = 0;
 613         mcview_state_machine_t state_saved;
 614         int n;
 615         int color;
 616
 617         state_saved = *state;
 618         n = mcview_next_combining_char_sequence (view, state, cs, 1 + MAX_COMBINING_CHARS, &color);
 619         if (n == 0)
 620         {
 621             if (linewidth != NULL)
 622                 *linewidth = col;
 623             return (col > 0) ? 1 : 0;
 624         }
 625
 626         if (view->search_start <= state->offset && state->offset < view->search_end)
 627             color = VIEW_SELECTED_COLOR;
 628
 629         if (cs[0] == '\n')
 630         {
 631             /* New line: reset all formatting state for the next paragraph. */
 632             mcview_state_machine_init (state, state->offset);
 633             if (linewidth != NULL)
 634                 *linewidth = col;
 635             return 1;
 636         }
 637
 638         if (mcview_is_non_spacing_mark (view, cs[0]))
 639         {
 640             /* Lonely combining character. Probably leftover after too many combining chars. Just ignore. */
 641             continue;
 642         }
 643
 644         /* Nonprintable, or lonely spacing mark */
 645         if ((!mcview_isprint (view, cs[0]) || mcview_ismark (view, cs[0])) && cs[0] != '\t')
 646             cs[0] = '.';
 647
 648         for (i = 0; i < n; i++)
 649             charwidth += mcview_wcwidth (view, cs[i]);
 650
 651         /* Adjust the width for TAB. It's handled below along with the normal characters,
 652          * so that it's wrapped consistently with them, and is painted with the proper
 653          * attributes (although currently it can't have a special color). */
 654         if (cs[0] == '\t')
 655         {
 656             charwidth = option_tab_spacing - state->unwrapped_column % option_tab_spacing;
 657             state->print_lonely_combining = TRUE;
 658         }
 659         else
 660             state->print_lonely_combining = FALSE;
 661
 662         /* In wrap mode only: We're done with this row if the character sequence wouldn't fit.
 663          * Except if at the first column, because then it wouldn't fit in the next row either.
 664          * In this extreme case let the unwrapped code below do its best to display it. */
 665         if (view->text_wrap_mode && (off_t) col + charwidth > dpy_text_column + (off_t) width
 666             && col > 0)
 667         {
 668             *state = state_saved;
 669             if (paragraph_ended != NULL)
 670                 *paragraph_ended = FALSE;
 671             if (linewidth != NULL)
 672                 *linewidth = col;
 673             return 1;
 674         }
 675
 676         /* Display, unless outside of the viewport. */
 677         if (row >= 0 && row < (int) height)
 678         {
 679             if ((off_t) col >= dpy_text_column &&
 680                 (off_t) col + charwidth <= dpy_text_column + (off_t) width)
 681             {
 682                 /* The combining character sequence fits entirely in the viewport. Print it. */
 683                 tty_setcolor (color);
 684                 widget_move (view, top + row, left + ((off_t) col - dpy_text_column));
 685                 if (cs[0] == '\t')
 686                 {
 687                     for (i = 0; i < charwidth; i++)
 688                         tty_print_char (' ');
 689                 }
 690                 else
 691                 {
 692                     j = 0;
 693                     for (i = 0; i < n; i++)
 694                         j += mcview_char_display (view, cs[i], str + j);
 695                     str[j] = '\0';
 696                     /* This is probably a bug in our tty layer, but tty_print_string
 697                      * normalizes the string, whereas tty_printf doesn't. Don't normalize,
 698                      * since we handle combining characters ourselves correctly, it's
 699                      * better if they are copy-pasted correctly. Ticket 3255. */
 700                     tty_printf ("%s", str);
 701                 }
 702             }
 703             else if ((off_t) col < dpy_text_column && (off_t) col + charwidth > dpy_text_column)
 704             {
 705                 /* The combining character sequence would cross the left edge of the viewport.
 706                  * This cannot happen with wrap mode. Print replacement character(s),
 707                  * or spaces with the correct attributes for partial Tabs. */
 708                 tty_setcolor (color);
 709                 for (i = dpy_text_column;
 710                      i < (off_t) col + charwidth && i < dpy_text_column + (off_t) width; i++)
 711                 {
 712                     widget_move (view, top + row, left + (i - dpy_text_column));
 713                     tty_print_anychar ((cs[0] == '\t') ? ' ' : PARTIAL_CJK_AT_LEFT_MARGIN);
 714                 }
 715             }
 716             else if ((off_t) col < dpy_text_column + (off_t) width &&
 717                      (off_t) col + charwidth > dpy_text_column + (off_t) width)
 718             {
 719                 /* The combining character sequence would cross the right edge of the viewport
 720                  * and we're not wrapping. Print replacement character(s),
 721                  * or spaces with the correct attributes for partial Tabs. */
 722                 tty_setcolor (color);
 723                 for (i = col; i < dpy_text_column + (off_t) width; i++)
 724                 {
 725                     widget_move (view, top + row, left + (i - dpy_text_column));
 726                     tty_print_anychar ((cs[0] == '\t') ? ' ' : PARTIAL_CJK_AT_RIGHT_MARGIN);
 727                 }
 728             }
 729         }
 730
 731         col += charwidth;
 732         state->unwrapped_column += charwidth;
 733
 734         if (!view->text_wrap_mode && (off_t) col >= dpy_text_column + (off_t) width
 735             && linewidth == NULL)
 736         {
 737             /* Optimization: Fast forward to the end of the line, rather than carefully
 738              * parsing and then not actually displaying it. */
 739             off_t eol;
 740
 741             eol = mcview_eol (view, state->offset, mcview_get_filesize (view));
 742             mcview_state_machine_init (state, eol);
 743             return 1;
 744         }
 745     }
 746 }
 747
 748 /* --------------------------------------------------------------------------------------------- */
 749 /**
 750  * Parse, format and possibly display one paragraph (perhaps not from the beginning).
 751  *
 752  * Formatting starts at the given "state" (which encodes the file offset and parser and formatter's
 753  * internal state). In unwrap mode, this should point to the beginning of the paragraph with the
 754  * default state, the additional horizontal scrolling is added here. In wrap mode, this may point
 755  * to the beginning of the line within a paragraph (to display the partial paragraph at the top),
 756  * with the proper state at that point.
 757  *
 758  * Displaying the next paragraph should start at "state"'s new value, or if we displayed the bottom
 759  * line then state->offset tells the file offset to be shown in the top bar.
 760  *
 761  * If "row" is negative, don't display the first abs(row) lines and display the rest from the top.
 762  * This was a nice idea but it's now unused :)
 763  *
 764  * If "row" is too large, don't display the paragraph at all but still return the number of lines.
 765  * This is used when moving upwards.
 766  *
 767  * @param view ...
 768  * @param state the parser-formatter state machine's state, updated
 769  * @param row print starting at this row
 770  * @return the number of rows the paragraphs is wrapped to, that is, 0 if we were already at EOF,
 771  *   otherwise 1 in unwrap mode, >= 1 in wrap mode. We stop when reaching the bottom of the
 772  *   viewport, it's not counted how many more lines the paragraph would occupy
 773  */
 774 static int
 775 mcview_display_paragraph (mcview_t * view, mcview_state_machine_t * state, int row)
 776 {
 777     const screen_dimen height = view->data_area.height;
 778     int lines = 0;
 779
 780     while (TRUE)
 781     {
 782         gboolean paragraph_ended;
 783
 784         lines += mcview_display_line (view, state, row, &paragraph_ended, NULL);
 785         if (paragraph_ended)
 786             return lines;
 787
 788         if (row < (int) height)
 789         {
 790             row++;
 791             /* stop if bottom of screen reached */
 792             if (row >= (int) height)
 793                 return lines;
 794         }
 795     }
 796 }
 797
 798 /* --------------------------------------------------------------------------------------------- */
 799 /**
 800  * Recompute dpy_state_top from dpy_start and dpy_paragraph_skip_lines. Clamp
 801  * dpy_paragraph_skip_lines if necessary.
 802  *
 803  * This method should be called in wrap mode after changing one of the parsing or formatting
 804  * properties (e.g. window width, encoding, nroff), or when switching to wrap mode from unwrap or
 805  * hex.
 806  *
 807  * If we stayed within the same paragraph then try to keep the vertical offset within that
 808  * paragraph as well. It might happen though that the paragraph became shorter than our desired
 809  * vertical position, in that case move to its last row.
 810  */
 811 static void
 812 mcview_wrap_fixup (mcview_t * view)
 813 {
 814     int lines = view->dpy_paragraph_skip_lines;
 815
 816     if (!view->dpy_wrap_dirty)
 817         return;
 818     view->dpy_wrap_dirty = FALSE;
 819
 820     view->dpy_paragraph_skip_lines = 0;
 821     mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
 822
 823     while (lines-- != 0)
 824     {
 825         mcview_state_machine_t state_prev;
 826         gboolean paragraph_ended;
 827
 828         state_prev = view->dpy_state_top;
 829         if (mcview_display_line (view, &view->dpy_state_top, -1, &paragraph_ended, NULL) == 0)
 830             break;
 831         if (paragraph_ended)
 832         {
 833             view->dpy_state_top = state_prev;
 834             break;
 835         }
 836         view->dpy_paragraph_skip_lines++;
 837     }
 838 }
 839
 840 /* --------------------------------------------------------------------------------------------- */
 841 /*** public functions ****************************************************************************/
 842 /* --------------------------------------------------------------------------------------------- */
 843
 844 /**
 845  * In both wrap and unwrap modes, dpy_start points to the beginning of the paragraph.
 846  *
 847  * In unwrap mode, start displaying from this position, probably applying an additional horizontal
 848  * scroll.
 849  *
 850  * In wrap mode, an additional dpy_paragraph_skip_lines lines are skipped from the top of this
 851  * paragraph. dpy_state_top contains the position and parser-formatter state corresponding to the
 852  * top left corner so we can just start rendering from here. Unless dpy_wrap_dirty is set in which
 853  * case dpy_state_top is invalid and we need to recompute first.
 854  */
 855 void
 856 mcview_display_text (mcview_t * view)
 857 {
 858     const screen_dimen left = view->data_area.left;
 859     const screen_dimen top = view->data_area.top;
 860     const screen_dimen height = view->data_area.height;
 861     int row;
 862     mcview_state_machine_t state;
 863     gboolean again;
 864
 865     do
 866     {
 867         int n;
 868
 869         again = FALSE;
 870
 871         mcview_display_clean (view);
 872         mcview_display_ruler (view);
 873
 874         if (!view->text_wrap_mode)
 875             mcview_state_machine_init (&state, view->dpy_start);
 876         else
 877         {
 878             mcview_wrap_fixup (view);
 879             state = view->dpy_state_top;
 880         }
 881
 882         for (row = 0; row < (int) height; row += n)
 883         {
 884             n = mcview_display_paragraph (view, &state, row);
 885             if (n == 0)
 886             {
 887                 /* In the rare case that displaying didn't start at the beginning
 888                  * of the file, yet there are some empty lines at the bottom,
 889                  * scroll the file and display again. This happens when e.g. the
 890                  * window is made bigger, or the file becomes shorter due to
 891                  * charset change or enabling nroff. */
 892                 if ((view->text_wrap_mode ? view->dpy_state_top.offset : view->dpy_start) > 0)
 893                 {
 894                     mcview_ascii_move_up (view, height - row);
 895                     again = TRUE;
 896                 }
 897                 break;
 898             }
 899         }
 900     }
 901     while (again);
 902
 903     view->dpy_end = state.offset;
 904     view->dpy_state_bottom = state;
 905
 906     tty_setcolor (VIEW_NORMAL_COLOR);
 907     if (mcview_show_eof != NULL && mcview_show_eof[0] != '\0')
 908         while (row < (int) height)
 909         {
 910             widget_move (view, top + row, left);
 911             /* TODO: should make it no wider than the viewport */
 912             tty_print_string (mcview_show_eof);
 913             row++;
 914         }
 915 }
 916
 917 /* --------------------------------------------------------------------------------------------- */
 918 /**
 919  * Move down.
 920  *
 921  * It's very simple. Just invisibly format the next "lines" lines, carefully carrying the formatter
 922  * state in wrap mode. But before each step we need to check if we've already hit the end of the
 923  * file, in that case we can no longer move. This is done by walking from dpy_state_bottom.
 924  *
 925  * Note that this relies on mcview_display_text() setting dpy_state_bottom to its correct value
 926  * upon rendering the screen contents. So don't call this function from other functions (e.g. at
 927  * the bottom of mcview_ascii_move_up()) which invalidate this value.
 928  */
 929 void
 930 mcview_ascii_move_down (mcview_t * view, off_t lines)
 931 {
 932     while (lines-- != 0)
 933     {
 934         gboolean paragraph_ended;
 935
 936         /* See if there's still data below the bottom line, by imaginarily displaying one
 937          * more line. This takes care of reading more data into growbuf, if required.
 938          * If the end position didn't advance, we're at EOF and hence bail out. */
 939         if (mcview_display_line (view, &view->dpy_state_bottom, -1, &paragraph_ended, NULL) == 0)
 940             break;
 941
 942         /* Okay, there's enough data. Move by 1 row at the top, too. No need to check for
 943          * EOF, that can't happen. */
 944         if (!view->text_wrap_mode)
 945         {
 946             view->dpy_start = mcview_eol (view, view->dpy_start, mcview_get_filesize (view));
 947             view->dpy_paragraph_skip_lines = 0;
 948             view->dpy_wrap_dirty = TRUE;
 949         }
 950         else
 951         {
 952             mcview_display_line (view, &view->dpy_state_top, -1, &paragraph_ended, NULL);
 953             if (!paragraph_ended)
 954                 view->dpy_paragraph_skip_lines++;
 955             else
 956             {
 957                 view->dpy_start = view->dpy_state_top.offset;
 958                 view->dpy_paragraph_skip_lines = 0;
 959             }
 960         }
 961     }
 962 }
 963
 964 /* --------------------------------------------------------------------------------------------- */
 965 /**
 966  * Move up.
 967  *
 968  * Unwrap mode: Piece of cake. Wrap mode: If we'd walk back more than the current line offset
 969  * within the paragraph, we need to jump back to the previous paragraph and compute its height to
 970  * see if we start from that paragraph, and repeat this if necessary. Once we're within the desired
 971  * paragraph, we still need to format it from its beginning to know the state.
 972  *
 973  * See the top of this file for comments about MAX_BACKWARDS_WALK_IN_PARAGRAPH.
 974  *
 975  * force_max is a nice protection against the rare extreme case that the file underneath us
 976  * changes, we don't want to endlessly consume a file of maybe full of zeros upon moving upwards.
 977  */
 978 void
 979 mcview_ascii_move_up (mcview_t * view, off_t lines)
 980 {
 981     if (!view->text_wrap_mode)
 982     {
 983         while (lines-- != 0)
 984             view->dpy_start = mcview_bol (view, view->dpy_start - 1, 0);
 985         view->dpy_paragraph_skip_lines = 0;
 986         view->dpy_wrap_dirty = TRUE;
 987     }
 988     else
 989     {
 990         int i;
 991
 992         while (lines > view->dpy_paragraph_skip_lines)
 993         {
 994             /* We need to go back to the previous paragraph. */
 995             if (view->dpy_start == 0)
 996             {
 997                 /* Oops, we're already in the first paragraph. */
 998                 view->dpy_paragraph_skip_lines = 0;
 999                 mcview_state_machine_init (&view->dpy_state_top, 0);
1000                 return;
1001             }
1002             lines -= view->dpy_paragraph_skip_lines;
1003             view->force_max = view->dpy_start;
1004             view->dpy_start =
1005                 mcview_bol (view, view->dpy_start - 1,
1006                             view->dpy_start - MAX_BACKWARDS_WALK_IN_PARAGRAPH);
1007             mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
1008             /* This is a tricky way of denoting that we're at the end of the paragraph.
1009              * Normally we'd jump to the next paragraph and reset paragraph_skip_lines. But for
1010              * walking backwards this is exactly what we need. */
1011             view->dpy_paragraph_skip_lines =
1012                 mcview_display_paragraph (view, &view->dpy_state_top, view->data_area.height);
1013             view->force_max = -1;
1014         }
1015
1016         /* Okay, we have have dpy_start pointing to the desired paragraph, and we still need to
1017          * walk back "lines" lines from the current "dpy_paragraph_skip_lines" offset. We can't do
1018          * that, so walk from the beginning of the paragraph. */
1019         mcview_state_machine_init (&view->dpy_state_top, view->dpy_start);
1020         view->dpy_paragraph_skip_lines -= lines;
1021         for (i = 0; i < view->dpy_paragraph_skip_lines; i++)
1022             mcview_display_line (view, &view->dpy_state_top, -1, NULL, NULL);
1023     }
1024 }
1025
1026 /* --------------------------------------------------------------------------------------------- */
1027
1028 void
1029 mcview_ascii_moveto_bol (mcview_t * view)
1030 {
1031     if (!view->text_wrap_mode)
1032         view->dpy_text_column = 0;
1033 }
1034
1035 /* --------------------------------------------------------------------------------------------- */
1036
1037 void
1038 mcview_ascii_moveto_eol (mcview_t * view)
1039 {
1040     if (!view->text_wrap_mode)
1041     {
1042         mcview_state_machine_t state;
1043         off_t linewidth;
1044
1045         /* Get the width of the topmost paragraph. */
1046         mcview_state_machine_init (&state, view->dpy_start);
1047         mcview_display_line (view, &state, -1, NULL, &linewidth);
1048         view->dpy_text_column = mcview_offset_doz (linewidth, (off_t) view->data_area.width);
1049     }
1050 }
1051
1052 /* --------------------------------------------------------------------------------------------- */
1053
1054 void
1055 mcview_state_machine_init (mcview_state_machine_t * state, off_t offset)
1056 {
1057     memset (state, 0, sizeof (*state));
1058     state->offset = offset;
1059     state->print_lonely_combining = TRUE;
1060 }
1061
1062 /* --------------------------------------------------------------------------------------------- */