beta-0.89.2
[luatex.git] / source / libs / poppler / poppler-src / poppler / TextOutputDev.cc
blobfff3f05e8302e580c9d392a112ebb72e9171b66c
1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
9 //========================================================================
11 // Modified under the Poppler project - http://poppler.freedesktop.org
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
16 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17 // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
18 // Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
20 // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
21 // Copyright (C) 2007, 2008, 2012 Adrian Johnson <ajohnson@redneon.com>
22 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
23 // Copyright (C) 2008, 2010-2012, 2014, 2015 Albert Astals Cid <aacid@kde.org>
24 // Copyright (C) 2008 Pino Toscano <pino@kde.org>
25 // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
26 // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
27 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
28 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
29 // Copyright (C) 2010 Marek Kasik <mkasik@redhat.com>
30 // Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
31 // Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
32 // Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
33 // Copyright (C) 2012, 2013-2015 Jason Crain <jason@aquaticape.us>
34 // Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
35 // Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
36 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
37 // Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
38 // Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
40 // To see a description of the changes please see the Changelog file that
41 // came with your tarball or type make ChangeLog if you are building from git
43 //========================================================================
45 #include <config.h>
47 #ifdef USE_GCC_PRAGMAS
48 #pragma implementation
49 #endif
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <stddef.h>
54 #include <math.h>
55 #include <float.h>
56 #include <ctype.h>
57 #ifdef _WIN32
58 #include <fcntl.h> // for O_BINARY
59 #include <io.h> // for setmode
60 #endif
61 #include "goo/gmem.h"
62 #include "goo/GooString.h"
63 #include "goo/GooList.h"
64 #include "poppler-config.h"
65 #include "Error.h"
66 #include "GlobalParams.h"
67 #include "UnicodeMap.h"
68 #include "UnicodeTypeTable.h"
69 #include "Link.h"
70 #include "TextOutputDev.h"
71 #include "Page.h"
72 #include "Annot.h"
73 #include "UTF.h"
75 #ifdef MACOS
76 // needed for setting type/creator of MacOS files
77 #include "ICSupport.h"
78 #endif
80 //------------------------------------------------------------------------
81 // parameters
82 //------------------------------------------------------------------------
84 // Each bucket in a text pool includes baselines within a range of
85 // this many points.
86 #define textPoolStep 4
88 // Inter-character space width which will cause addChar to start a new
89 // word.
90 #define minWordBreakSpace 0.1
92 // Negative inter-character space width, i.e., overlap, which will
93 // cause addChar to start a new word.
94 #define minDupBreakOverlap 0.2
96 // Max distance between baselines of two lines within a block, as a
97 // fraction of the font size.
98 #define maxLineSpacingDelta 1.5
100 // Max difference in primary font sizes on two lines in the same
101 // block. Delta1 is used when examining new lines above and below the
102 // current block; delta2 is used when examining text that overlaps the
103 // current block; delta3 is used when examining text to the left and
104 // right of the current block.
105 #define maxBlockFontSizeDelta1 0.05
106 #define maxBlockFontSizeDelta2 0.6
107 #define maxBlockFontSizeDelta3 0.2
109 // Max difference in font sizes inside a word.
110 #define maxWordFontSizeDelta 0.05
112 // Maximum distance between baselines of two words on the same line,
113 // e.g., distance between subscript or superscript and the primary
114 // baseline, as a fraction of the font size.
115 #define maxIntraLineDelta 0.5
117 // Minimum inter-word spacing, as a fraction of the font size. (Only
118 // used for raw ordering.)
119 #define minWordSpacing 0.15
121 // Maximum inter-word spacing, as a fraction of the font size.
122 #define maxWordSpacing 1.5
124 // Maximum horizontal spacing which will allow a word to be pulled
125 // into a block.
126 #define minColSpacing1 0.3
128 // Minimum spacing between columns, as a fraction of the font size.
129 #define minColSpacing2 1.0
131 // Maximum vertical spacing between blocks within a flow, as a
132 // multiple of the font size.
133 #define maxBlockSpacing 2.5
135 // Minimum spacing between characters within a word, as a fraction of
136 // the font size.
137 #define minCharSpacing -0.5
139 // Maximum spacing between characters within a word, as a fraction of
140 // the font size, when there is no obvious extra-wide character
141 // spacing.
142 #define maxCharSpacing 0.03
144 // When extra-wide character spacing is detected, the inter-character
145 // space threshold is set to the minimum inter-character space
146 // multiplied by this constant.
147 #define maxWideCharSpacingMul 1.3
149 // Upper limit on spacing between characters in a word.
150 #define maxWideCharSpacing 0.4
152 // Max difference in primary,secondary coordinates (as a fraction of
153 // the font size) allowed for duplicated text (fake boldface, drop
154 // shadows) which is to be discarded.
155 #define dupMaxPriDelta 0.1
156 #define dupMaxSecDelta 0.2
158 // Max width of underlines (in points).
159 #define maxUnderlineWidth 3
161 // Min distance between baseline and underline (in points).
162 //~ this should be font-size-dependent
163 #define minUnderlineGap -2
165 // Max distance between baseline and underline (in points).
166 //~ this should be font-size-dependent
167 #define maxUnderlineGap 4
169 // Max horizontal distance between edge of word and start of underline
170 // (in points).
171 //~ this should be font-size-dependent
172 #define underlineSlack 1
174 // Max distance between edge of text and edge of link border
175 #define hyperlinkSlack 2
177 // Max distance between characters when combining a base character and
178 // combining character
179 #define combMaxMidDelta 0.3
180 #define combMaxBaseDelta 0.4
182 static int reorderText(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GooString *s, Unicode* u) {
183 char lre[8], rle[8], popdf[8], buf[8];
184 int lreLen = 0, rleLen = 0, popdfLen = 0, n;
185 int nCols, i, j, k;
187 nCols = 0;
189 if (s) {
190 lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
191 rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
192 popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
195 if (primaryLR) {
196 i = 0;
197 while (i < len) {
198 // output a left-to-right section
199 for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
200 for (k = i; k < j; ++k) {
201 if (s) {
202 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
203 s->append(buf, n);
205 if (u) u[nCols] = text[k];
206 ++nCols;
208 i = j;
209 // output a right-to-left section
210 for (j = i;
211 j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
212 ++j) ;
213 if (j > i) {
214 if (s) s->append(rle, rleLen);
215 for (k = j - 1; k >= i; --k) {
216 if (s) {
217 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
218 s->append(buf, n);
220 if (u) u[nCols] = text[k];
221 ++nCols;
223 if (s) s->append(popdf, popdfLen);
224 i = j;
227 } else {
228 // Note: This code treats numeric characters (European and
229 // Arabic/Indic) as left-to-right, which isn't strictly correct
230 // (incurs extra LRE/POPDF pairs), but does produce correct
231 // visual formatting.
232 if (s) s->append(rle, rleLen);
233 i = len - 1;
234 while (i >= 0) {
235 // output a right-to-left section
236 for (j = i;
237 j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
238 --j) ;
239 for (k = i; k > j; --k) {
240 if (s) {
241 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
242 s->append(buf, n);
244 if (u) u[nCols] = text[k];
245 ++nCols;
247 i = j;
248 // output a left-to-right section
249 for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
250 if (j < i) {
251 if (s) s->append(lre, lreLen);
252 for (k = j + 1; k <= i; ++k) {
253 if (s) {
254 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
255 s->append(buf, n);
257 if (u) u[nCols] = text[k];
258 ++nCols;
260 if (s) s->append(popdf, popdfLen);
261 i = j;
264 if (s) s->append(popdf, popdfLen);
267 return nCols;
270 //------------------------------------------------------------------------
271 // TextUnderline
272 //------------------------------------------------------------------------
274 class TextUnderline {
275 public:
277 TextUnderline(double x0A, double y0A, double x1A, double y1A)
278 { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
279 ~TextUnderline() {}
281 double x0, y0, x1, y1;
282 GBool horiz;
285 //------------------------------------------------------------------------
286 // TextLink
287 //------------------------------------------------------------------------
289 class TextLink {
290 public:
292 TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA)
293 { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; link = linkA; }
294 ~TextLink() {}
296 int xMin, yMin, xMax, yMax;
297 AnnotLink *link;
300 //------------------------------------------------------------------------
301 // TextFontInfo
302 //------------------------------------------------------------------------
304 TextFontInfo::TextFontInfo(GfxState *state) {
305 gfxFont = state->getFont();
306 if (gfxFont)
307 gfxFont->incRefCnt();
308 #if TEXTOUT_WORD_LIST
309 fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy()
310 : (GooString *)NULL;
311 flags = gfxFont ? gfxFont->getFlags() : 0;
312 #endif
315 TextFontInfo::~TextFontInfo() {
316 if (gfxFont)
317 gfxFont->decRefCnt();
318 #if TEXTOUT_WORD_LIST
319 if (fontName) {
320 delete fontName;
322 #endif
325 GBool TextFontInfo::matches(GfxState *state) {
326 return state->getFont() == gfxFont;
329 GBool TextFontInfo::matches(TextFontInfo *fontInfo) {
330 return gfxFont == fontInfo->gfxFont;
333 double TextFontInfo::getAscent() {
334 return gfxFont ? gfxFont->getAscent() : 0.95;
337 double TextFontInfo::getDescent() {
338 return gfxFont ? gfxFont->getDescent() : -0.35;
341 int TextFontInfo::getWMode() {
342 return gfxFont ? gfxFont->getWMode() : 0;
345 //------------------------------------------------------------------------
346 // TextWord
347 //------------------------------------------------------------------------
349 TextWord::TextWord(GfxState *state, int rotA, double fontSizeA) {
350 rot = rotA;
351 fontSize = fontSizeA;
352 text = NULL;
353 charcode = NULL;
354 edge = NULL;
355 charPos = NULL;
356 font = NULL;
357 textMat = NULL;
358 len = size = 0;
359 spaceAfter = gFalse;
360 next = NULL;
362 #if TEXTOUT_WORD_LIST
363 GfxRGB rgb;
365 if ((state->getRender() & 3) == 1) {
366 state->getStrokeRGB(&rgb);
367 } else {
368 state->getFillRGB(&rgb);
370 colorR = colToDbl(rgb.r);
371 colorG = colToDbl(rgb.g);
372 colorB = colToDbl(rgb.b);
373 #endif
375 underlined = gFalse;
376 link = NULL;
379 TextWord::~TextWord() {
380 gfree(text);
381 gfree(charcode);
382 gfree(edge);
383 gfree(charPos);
384 gfree(font);
385 gfree(textMat);
388 void TextWord::addChar(GfxState *state, TextFontInfo *fontA, double x, double y,
389 double dx, double dy, int charPosA, int charLen,
390 CharCode c, Unicode u, const Matrix &textMatA) {
391 ensureCapacity(len+1);
392 text[len] = u;
393 charcode[len] = c;
394 charPos[len] = charPosA;
395 charPos[len + 1] = charPosA + charLen;
396 font[len] = fontA;
397 textMat[len] = textMatA;
399 if (len == 0)
400 setInitialBounds(fontA, x, y);
402 if (wMode) { // vertical writing mode
403 // NB: the rotation value has been incremented by 1 (in
404 // TextPage::beginWord()) for vertical writing mode
405 switch (rot) {
406 case 0:
407 edge[len] = x - fontSize;
408 xMax = edge[len+1] = x;
409 break;
410 case 1:
411 edge[len] = y - fontSize;
412 yMax = edge[len+1] = y;
413 break;
414 case 2:
415 edge[len] = x + fontSize;
416 xMin = edge[len+1] = x;
417 break;
418 case 3:
419 edge[len] = y + fontSize;
420 yMin = edge[len+1] = y;
421 break;
423 } else { // horizontal writing mode
424 switch (rot) {
425 case 0:
426 edge[len] = x;
427 xMax = edge[len+1] = x + dx;
428 break;
429 case 1:
430 edge[len] = y;
431 yMax = edge[len+1] = y + dy;
432 break;
433 case 2:
434 edge[len] = x;
435 xMin = edge[len+1] = x + dx;
436 break;
437 case 3:
438 edge[len] = y;
439 yMin = edge[len+1] = y + dy;
440 break;
443 ++len;
446 void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y) {
447 double ascent = fontA->getAscent() * fontSize;
448 double descent = fontA->getDescent() * fontSize;
449 wMode = fontA->getWMode();
451 if (wMode) { // vertical writing mode
452 // NB: the rotation value has been incremented by 1 (in
453 // TextPage::beginWord()) for vertical writing mode
454 switch (rot) {
455 case 0:
456 xMin = x - fontSize;
457 yMin = y - fontSize;
458 yMax = y;
459 base = y;
460 break;
461 case 1:
462 xMin = x;
463 yMin = y - fontSize;
464 xMax = x + fontSize;
465 base = x;
466 break;
467 case 2:
468 yMin = y;
469 xMax = x + fontSize;
470 yMax = y + fontSize;
471 base = y;
472 break;
473 case 3:
474 xMin = x - fontSize;
475 xMax = x;
476 yMax = y + fontSize;
477 base = x;
478 break;
480 } else { // horizontal writing mode
481 switch (rot) {
482 case 0:
483 xMin = x;
484 yMin = y - ascent;
485 yMax = y - descent;
486 if (yMin == yMax) {
487 // this is a sanity check for a case that shouldn't happen -- but
488 // if it does happen, we want to avoid dividing by zero later
489 yMin = y;
490 yMax = y + 1;
492 base = y;
493 break;
494 case 1:
495 xMin = x + descent;
496 yMin = y;
497 xMax = x + ascent;
498 if (xMin == xMax) {
499 // this is a sanity check for a case that shouldn't happen -- but
500 // if it does happen, we want to avoid dividing by zero later
501 xMin = x;
502 xMax = x + 1;
504 base = x;
505 break;
506 case 2:
507 yMin = y + descent;
508 xMax = x;
509 yMax = y + ascent;
510 if (yMin == yMax) {
511 // this is a sanity check for a case that shouldn't happen -- but
512 // if it does happen, we want to avoid dividing by zero later
513 yMin = y;
514 yMax = y + 1;
516 base = y;
517 break;
518 case 3:
519 xMin = x - ascent;
520 xMax = x - descent;
521 yMax = y;
522 if (xMin == xMax) {
523 // this is a sanity check for a case that shouldn't happen -- but
524 // if it does happen, we want to avoid dividing by zero later
525 xMin = x;
526 xMax = x + 1;
528 base = x;
529 break;
534 void TextWord::ensureCapacity(int capacity) {
535 if (capacity > size) {
536 size = std::max(size + 16, capacity);
537 text = (Unicode *)greallocn(text, size, sizeof(Unicode));
538 charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
539 edge = (double *)greallocn(edge, (size + 1), sizeof(double));
540 charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
541 font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
542 textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
546 struct CombiningTable {
547 Unicode base;
548 Unicode comb;
551 static struct CombiningTable combiningTable[] = {
552 {0x0060, 0x0300}, // grave
553 {0x00a8, 0x0308}, // dieresis
554 {0x00af, 0x0304}, // macron
555 {0x00b4, 0x0301}, // acute
556 {0x00b8, 0x0327}, // cedilla
557 {0x02c6, 0x0302}, // circumflex
558 {0x02c7, 0x030c}, // caron
559 {0x02d8, 0x0306}, // breve
560 {0x02d9, 0x0307}, // dotaccent
561 {0x02da, 0x030a}, // ring
562 {0x02dc, 0x0303}, // tilde
563 {0x02dd, 0x030b} // hungarumlaut (double acute accent)
566 // returning combining versions of characters
567 Unicode getCombiningChar(Unicode u) {
568 int len = sizeof(combiningTable) / sizeof(combiningTable[0]);
569 for (int i = 0; i < len; ++i) {
570 if (u == combiningTable[i].base)
571 return combiningTable[i].comb;
573 return 0;
576 GBool TextWord::addCombining(GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y,
577 double dx, double dy, int charPosA, int charLen,
578 CharCode c, Unicode u, const Matrix &textMatA) {
579 if (len == 0 || wMode != 0 || fontA->getWMode() != 0)
580 return gFalse;
582 Unicode cCurrent = getCombiningChar(u);
583 Unicode cPrev = getCombiningChar(text[len-1]);
584 double edgeMid = (edge[len-1] + edge[len]) / 2;
585 double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta;
587 if (cCurrent != 0 && unicodeTypeAlphaNum(text[len-1])) {
588 // Current is a combining character, previous is base character
589 maxScaledMidDelta = fabs(edge[len] - edge[len-1]) * combMaxMidDelta;
590 charMid = charBase = maxScaledBaseDelta = 0;
592 // Test if characters overlap
593 if (rot == 0 || rot == 2) {
594 charMid = x + (dx / 2);
595 charBase = y;
596 maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
597 } else {
598 charMid = y + (dy / 2);
599 charBase = x;
600 maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
603 if (fabs(charMid - edgeMid) >= maxScaledMidDelta ||
604 fabs(charBase - base) >= maxScaledBaseDelta)
605 return gFalse;
607 // Add character, but don't adjust edge / bounding box because
608 // combining character's positioning could be odd.
609 ensureCapacity(len+1);
610 text[len] = cCurrent;
611 charcode[len] = c;
612 charPos[len] = charPosA;
613 charPos[len+1] = charPosA + charLen;
614 font[len] = fontA;
615 textMat[len] = textMatA;
616 edge[len+1] = edge[len];
617 edge[len] = (edge[len+1] + edge[len-1]) / 2;
618 ++len;
619 return gTrue;
622 if (cPrev != 0 && unicodeTypeAlphaNum(u)) {
623 // Previous is a combining character, current is base character
624 maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
625 charMid = charBase = maxScaledMidDelta = 0;
627 // Test if characters overlap
628 if (rot == 0 || rot == 2) {
629 charMid = x + (dx / 2);
630 charBase = y;
631 maxScaledMidDelta = fabs(dx * combMaxMidDelta);
632 } else {
633 charMid = y + (dy / 2);
634 charBase = x;
635 maxScaledMidDelta = fabs(dy * combMaxMidDelta);
638 if (fabs(charMid - edgeMid) >= maxScaledMidDelta ||
639 fabs(charBase - base) >= maxScaledBaseDelta)
640 return gFalse;
642 // move combining character to after base character
643 ensureCapacity(len+1);
644 fontSize = fontSizeA;
645 text[len] = cPrev;
646 charcode[len] = charcode[len-1];
647 charPos[len] = charPosA;
648 charPos[len+1] = charPosA + charLen;
649 font[len] = font[len-1];
650 textMat[len] = textMat[len-1];
652 text[len-1] = u;
653 charcode[len-1] = c;
654 font[len-1] = fontA;
655 textMat[len-1] = textMatA;
657 if (len == 1)
658 setInitialBounds(fontA, x, y);
660 // Updated edges / bounding box because we changed the base
661 // character.
662 if (wMode) {
663 switch (rot) {
664 case 0:
665 edge[len-1] = x - fontSize;
666 xMax = edge[len+1] = x;
667 break;
668 case 1:
669 edge[len-1] = y - fontSize;
670 yMax = edge[len+1] = y;
671 break;
672 case 2:
673 edge[len-1] = x + fontSize;
674 xMin = edge[len+1] = x;
675 break;
676 case 3:
677 edge[len-1] = y + fontSize;
678 yMin = edge[len+1] = y;
679 break;
681 } else {
682 switch (rot) {
683 case 0:
684 edge[len-1] = x;
685 xMax = edge[len+1] = x + dx;
686 break;
687 case 1:
688 edge[len-1] = y;
689 yMax = edge[len+1] = y + dy;
690 break;
691 case 2:
692 edge[len-1] = x;
693 xMin = edge[len+1] = x + dx;
694 break;
695 case 3:
696 edge[len-1] = y;
697 yMin = edge[len+1] = y + dy;
698 break;
702 edge[len] = (edge[len+1] + edge[len-1]) / 2;
703 ++len;
704 return gTrue;
706 return gFalse;
709 void TextWord::merge(TextWord *word) {
710 int i;
712 if (word->xMin < xMin) {
713 xMin = word->xMin;
715 if (word->yMin < yMin) {
716 yMin = word->yMin;
718 if (word->xMax > xMax) {
719 xMax = word->xMax;
721 if (word->yMax > yMax) {
722 yMax = word->yMax;
724 ensureCapacity(len + word->len);
725 for (i = 0; i < word->len; ++i) {
726 text[len + i] = word->text[i];
727 charcode[len + i] = word->charcode[i];
728 edge[len + i] = word->edge[i];
729 charPos[len + i] = word->charPos[i];
730 font[len + i] = word->font[i];
731 textMat[len + i] = word->textMat[i];
733 edge[len + word->len] = word->edge[word->len];
734 charPos[len + word->len] = word->charPos[word->len];
735 len += word->len;
738 inline int TextWord::primaryCmp(TextWord *word) {
739 double cmp;
741 cmp = 0; // make gcc happy
742 switch (rot) {
743 case 0:
744 cmp = xMin - word->xMin;
745 break;
746 case 1:
747 cmp = yMin - word->yMin;
748 break;
749 case 2:
750 cmp = word->xMax - xMax;
751 break;
752 case 3:
753 cmp = word->yMax - yMax;
754 break;
756 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
759 double TextWord::primaryDelta(TextWord *word) {
760 double delta;
762 delta = 0; // make gcc happy
763 switch (rot) {
764 case 0:
765 delta = word->xMin - xMax;
766 break;
767 case 1:
768 delta = word->yMin - yMax;
769 break;
770 case 2:
771 delta = xMin - word->xMax;
772 break;
773 case 3:
774 delta = yMin - word->yMax;
775 break;
777 return delta;
780 int TextWord::cmpYX(const void *p1, const void *p2) {
781 TextWord *word1 = *(TextWord **)p1;
782 TextWord *word2 = *(TextWord **)p2;
783 double cmp;
785 cmp = word1->yMin - word2->yMin;
786 if (cmp == 0) {
787 cmp = word1->xMin - word2->xMin;
789 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
792 #if TEXTOUT_WORD_LIST
794 GooString *TextWord::getText() {
795 GooString *s;
796 UnicodeMap *uMap;
797 char buf[8];
798 int n, i;
800 s = new GooString();
801 if (!(uMap = globalParams->getTextEncoding())) {
802 return s;
804 for (i = 0; i < len; ++i) {
805 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
806 s->append(buf, n);
808 uMap->decRefCnt();
809 return s;
812 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
813 double *xMaxA, double *yMaxA) {
814 if (charIdx < 0 || charIdx >= len) {
815 return;
817 switch (rot) {
818 case 0:
819 *xMinA = edge[charIdx];
820 *xMaxA = edge[charIdx + 1];
821 *yMinA = yMin;
822 *yMaxA = yMax;
823 break;
824 case 1:
825 *xMinA = xMin;
826 *xMaxA = xMax;
827 *yMinA = edge[charIdx];
828 *yMaxA = edge[charIdx + 1];
829 break;
830 case 2:
831 *xMinA = edge[charIdx + 1];
832 *xMaxA = edge[charIdx];
833 *yMinA = yMin;
834 *yMaxA = yMax;
835 break;
836 case 3:
837 *xMinA = xMin;
838 *xMaxA = xMax;
839 *yMinA = edge[charIdx + 1];
840 *yMaxA = edge[charIdx];
841 break;
845 #endif // TEXTOUT_WORD_LIST
847 //------------------------------------------------------------------------
848 // TextPool
849 //------------------------------------------------------------------------
851 TextPool::TextPool() {
852 minBaseIdx = 0;
853 maxBaseIdx = -1;
854 pool = NULL;
855 cursor = NULL;
856 cursorBaseIdx = -1;
859 TextPool::~TextPool() {
860 int baseIdx;
861 TextWord *word, *word2;
863 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
864 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
865 word2 = word->next;
866 delete word;
869 gfree(pool);
872 int TextPool::getBaseIdx(double base) {
873 int baseIdx;
875 baseIdx = (int)(base / textPoolStep);
876 if (baseIdx < minBaseIdx) {
877 return minBaseIdx;
879 if (baseIdx > maxBaseIdx) {
880 return maxBaseIdx;
882 return baseIdx;
885 void TextPool::addWord(TextWord *word) {
886 TextWord **newPool;
887 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
888 TextWord *w0, *w1;
890 // expand the array if needed
891 if (unlikely((word->base / textPoolStep) > INT_MAX)) {
892 error(errSyntaxWarning, -1, "word->base / textPoolStep > INT_MAX");
893 return;
895 wordBaseIdx = (int)(word->base / textPoolStep);
896 if (minBaseIdx > maxBaseIdx) {
897 minBaseIdx = wordBaseIdx - 128;
898 maxBaseIdx = wordBaseIdx + 128;
899 pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
900 sizeof(TextWord *));
901 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
902 pool[baseIdx - minBaseIdx] = NULL;
904 } else if (wordBaseIdx < minBaseIdx) {
905 newMinBaseIdx = wordBaseIdx - 128;
906 newPool = (TextWord **)gmallocn(maxBaseIdx - newMinBaseIdx + 1,
907 sizeof(TextWord *));
908 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
909 newPool[baseIdx - newMinBaseIdx] = NULL;
911 memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
912 (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
913 gfree(pool);
914 pool = newPool;
915 minBaseIdx = newMinBaseIdx;
916 } else if (wordBaseIdx > maxBaseIdx) {
917 newMaxBaseIdx = wordBaseIdx + 128;
918 pool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
919 sizeof(TextWord *));
920 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
921 pool[baseIdx - minBaseIdx] = NULL;
923 maxBaseIdx = newMaxBaseIdx;
926 // insert the new word
927 if (cursor && wordBaseIdx == cursorBaseIdx &&
928 word->primaryCmp(cursor) >= 0) {
929 w0 = cursor;
930 w1 = cursor->next;
931 } else {
932 w0 = NULL;
933 w1 = pool[wordBaseIdx - minBaseIdx];
935 for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
936 word->next = w1;
937 if (w0) {
938 w0->next = word;
939 } else {
940 pool[wordBaseIdx - minBaseIdx] = word;
942 cursor = word;
943 cursorBaseIdx = wordBaseIdx;
946 //------------------------------------------------------------------------
947 // TextLine
948 //------------------------------------------------------------------------
950 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
951 blk = blkA;
952 rot = rotA;
953 base = baseA;
954 words = lastWord = NULL;
955 text = NULL;
956 edge = NULL;
957 col = NULL;
958 len = 0;
959 convertedLen = 0;
960 hyphenated = gFalse;
961 next = NULL;
962 xMin = yMin = 0;
963 xMax = yMax = -1;
964 normalized = NULL;
965 normalized_len = 0;
966 normalized_idx = NULL;
969 TextLine::~TextLine() {
970 TextWord *word;
972 while (words) {
973 word = words;
974 words = words->next;
975 delete word;
977 gfree(text);
978 gfree(edge);
979 gfree(col);
980 if (normalized) {
981 gfree(normalized);
982 gfree(normalized_idx);
986 void TextLine::addWord(TextWord *word) {
987 if (lastWord) {
988 lastWord->next = word;
989 } else {
990 words = word;
992 lastWord = word;
994 if (xMin > xMax) {
995 xMin = word->xMin;
996 xMax = word->xMax;
997 yMin = word->yMin;
998 yMax = word->yMax;
999 } else {
1000 if (word->xMin < xMin) {
1001 xMin = word->xMin;
1003 if (word->xMax > xMax) {
1004 xMax = word->xMax;
1006 if (word->yMin < yMin) {
1007 yMin = word->yMin;
1009 if (word->yMax > yMax) {
1010 yMax = word->yMax;
1015 double TextLine::primaryDelta(TextLine *line) {
1016 double delta;
1018 delta = 0; // make gcc happy
1019 switch (rot) {
1020 case 0:
1021 delta = line->xMin - xMax;
1022 break;
1023 case 1:
1024 delta = line->yMin - yMax;
1025 break;
1026 case 2:
1027 delta = xMin - line->xMax;
1028 break;
1029 case 3:
1030 delta = yMin - line->yMax;
1031 break;
1033 return delta;
1036 int TextLine::primaryCmp(TextLine *line) {
1037 double cmp;
1039 cmp = 0; // make gcc happy
1040 switch (rot) {
1041 case 0:
1042 cmp = xMin - line->xMin;
1043 break;
1044 case 1:
1045 cmp = yMin - line->yMin;
1046 break;
1047 case 2:
1048 cmp = line->xMax - xMax;
1049 break;
1050 case 3:
1051 cmp = line->yMax - yMax;
1052 break;
1054 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1057 int TextLine::secondaryCmp(TextLine *line) {
1058 double cmp;
1060 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
1061 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1064 int TextLine::cmpYX(TextLine *line) {
1065 int cmp;
1067 if ((cmp = secondaryCmp(line))) {
1068 return cmp;
1070 return primaryCmp(line);
1073 int TextLine::cmpXY(const void *p1, const void *p2) {
1074 TextLine *line1 = *(TextLine **)p1;
1075 TextLine *line2 = *(TextLine **)p2;
1076 int cmp;
1078 if ((cmp = line1->primaryCmp(line2))) {
1079 return cmp;
1081 return line1->secondaryCmp(line2);
1084 void TextLine::coalesce(UnicodeMap *uMap) {
1085 TextWord *word0, *word1;
1086 double space, delta, minSpace;
1087 GBool isUnicode;
1088 char buf[8];
1089 int i, j;
1091 if (words->next) {
1093 // compute the inter-word space threshold
1094 if (words->len > 1 || words->next->len > 1) {
1095 minSpace = 0;
1096 } else {
1097 minSpace = words->primaryDelta(words->next);
1098 for (word0 = words->next, word1 = word0->next;
1099 word1 && minSpace > 0;
1100 word0 = word1, word1 = word0->next) {
1101 if (word1->len > 1) {
1102 minSpace = 0;
1104 delta = word0->primaryDelta(word1);
1105 if (delta < minSpace) {
1106 minSpace = delta;
1110 if (minSpace <= 0) {
1111 space = maxCharSpacing * words->fontSize;
1112 } else {
1113 space = maxWideCharSpacingMul * minSpace;
1114 if (space > maxWideCharSpacing * words->fontSize) {
1115 space = maxWideCharSpacing * words->fontSize;
1119 // merge words
1120 word0 = words;
1121 word1 = words->next;
1122 while (word1) {
1123 if (word0->primaryDelta(word1) >= space) {
1124 word0->spaceAfter = gTrue;
1125 word0 = word1;
1126 word1 = word1->next;
1127 } else if (word0->font[word0->len - 1] == word1->font[0] &&
1128 word0->underlined == word1->underlined &&
1129 fabs(word0->fontSize - word1->fontSize) <
1130 maxWordFontSizeDelta * words->fontSize &&
1131 word1->charPos[0] == word0->charPos[word0->len]) {
1132 word0->merge(word1);
1133 word0->next = word1->next;
1134 delete word1;
1135 word1 = word0->next;
1136 } else {
1137 word0 = word1;
1138 word1 = word1->next;
1143 // build the line text
1144 isUnicode = uMap ? uMap->isUnicode() : gFalse;
1145 len = 0;
1146 for (word1 = words; word1; word1 = word1->next) {
1147 len += word1->len;
1148 if (word1->spaceAfter) {
1149 ++len;
1152 text = (Unicode *)gmallocn(len, sizeof(Unicode));
1153 edge = (double *)gmallocn(len + 1, sizeof(double));
1154 i = 0;
1155 for (word1 = words; word1; word1 = word1->next) {
1156 for (j = 0; j < word1->len; ++j) {
1157 text[i] = word1->text[j];
1158 edge[i] = word1->edge[j];
1159 ++i;
1161 edge[i] = word1->edge[word1->len];
1162 if (word1->spaceAfter) {
1163 text[i] = (Unicode)0x0020;
1164 ++i;
1168 // compute convertedLen and set up the col array
1169 col = (int *)gmallocn(len + 1, sizeof(int));
1170 convertedLen = 0;
1171 for (i = 0; i < len; ++i) {
1172 col[i] = convertedLen;
1173 if (isUnicode) {
1174 ++convertedLen;
1175 } else if (uMap) {
1176 convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
1179 col[len] = convertedLen;
1181 // check for hyphen at end of line
1182 //~ need to check for other chars used as hyphens
1183 hyphenated = text[len - 1] == (Unicode)'-';
1186 //------------------------------------------------------------------------
1187 // TextLineFrag
1188 //------------------------------------------------------------------------
1190 class TextLineFrag {
1191 public:
1193 TextLine *line; // the line object
1194 int start, len; // offset and length of this fragment
1195 // (in Unicode chars)
1196 double xMin, xMax; // bounding box coordinates
1197 double yMin, yMax;
1198 double base; // baseline virtual coordinate
1199 int col; // first column
1201 void init(TextLine *lineA, int startA, int lenA);
1202 void computeCoords(GBool oneRot);
1204 static int cmpYXPrimaryRot(const void *p1, const void *p2);
1205 static int cmpYXLineRot(const void *p1, const void *p2);
1206 static int cmpXYLineRot(const void *p1, const void *p2);
1207 static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
1208 static int cmpXYColumnLineRot(const void *p1, const void *p2);
1211 void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
1212 line = lineA;
1213 start = startA;
1214 len = lenA;
1215 col = line->col[start];
1218 void TextLineFrag::computeCoords(GBool oneRot) {
1219 TextBlock *blk;
1220 double d0, d1, d2, d3, d4;
1222 if (oneRot) {
1224 switch (line->rot) {
1225 case 0:
1226 xMin = line->edge[start];
1227 xMax = line->edge[start + len];
1228 yMin = line->yMin;
1229 yMax = line->yMax;
1230 break;
1231 case 1:
1232 xMin = line->xMin;
1233 xMax = line->xMax;
1234 yMin = line->edge[start];
1235 yMax = line->edge[start + len];
1236 break;
1237 case 2:
1238 xMin = line->edge[start + len];
1239 xMax = line->edge[start];
1240 yMin = line->yMin;
1241 yMax = line->yMax;
1242 break;
1243 case 3:
1244 xMin = line->xMin;
1245 xMax = line->xMax;
1246 yMin = line->edge[start + len];
1247 yMax = line->edge[start];
1248 break;
1250 base = line->base;
1252 } else {
1254 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
1256 xMin = line->edge[start];
1257 xMax = line->edge[start + len];
1258 yMin = line->yMin;
1259 yMax = line->yMax;
1260 base = line->base;
1262 } else {
1264 blk = line->blk;
1265 d0 = line->edge[start];
1266 d1 = line->edge[start + len];
1267 d2 = d3 = d4 = 0; // make gcc happy
1269 switch (line->rot) {
1270 case 0:
1271 d2 = line->yMin;
1272 d3 = line->yMax;
1273 d4 = line->base;
1274 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
1275 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
1276 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
1277 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
1278 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
1279 break;
1280 case 1:
1281 d2 = line->xMax;
1282 d3 = line->xMin;
1283 d4 = line->base;
1284 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
1285 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
1286 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
1287 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
1288 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
1289 break;
1290 case 2:
1291 d2 = line->yMax;
1292 d3 = line->yMin;
1293 d4 = line->base;
1294 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
1295 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
1296 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
1297 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
1298 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
1299 break;
1300 case 3:
1301 d2 = line->xMin;
1302 d3 = line->xMax;
1303 d4 = line->base;
1304 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
1305 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
1306 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
1307 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
1308 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
1309 break;
1312 switch (line->blk->page->primaryRot) {
1313 case 0:
1314 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
1315 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
1316 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
1317 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
1318 base = blk->yMin + d4 * (blk->yMax - blk->yMin);
1319 break;
1320 case 1:
1321 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
1322 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
1323 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
1324 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
1325 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
1326 break;
1327 case 2:
1328 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
1329 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
1330 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
1331 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
1332 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
1333 break;
1334 case 3:
1335 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
1336 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
1337 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
1338 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
1339 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
1340 break;
1347 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
1348 TextLineFrag *frag1 = (TextLineFrag *)p1;
1349 TextLineFrag *frag2 = (TextLineFrag *)p2;
1350 double cmp;
1352 cmp = 0; // make gcc happy
1353 switch (frag1->line->blk->page->primaryRot) {
1354 case 0:
1355 if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
1356 cmp = frag1->xMin - frag2->xMin;
1358 break;
1359 case 1:
1360 if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
1361 cmp = frag1->yMin - frag2->yMin;
1363 break;
1364 case 2:
1365 if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
1366 cmp = frag2->xMax - frag1->xMax;
1368 break;
1369 case 3:
1370 if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
1371 cmp = frag2->yMax - frag1->yMax;
1373 break;
1375 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1378 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
1379 TextLineFrag *frag1 = (TextLineFrag *)p1;
1380 TextLineFrag *frag2 = (TextLineFrag *)p2;
1381 double cmp;
1383 cmp = 0; // make gcc happy
1384 switch (frag1->line->rot) {
1385 case 0:
1386 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1387 cmp = frag1->xMin - frag2->xMin;
1389 break;
1390 case 1:
1391 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1392 cmp = frag1->yMin - frag2->yMin;
1394 break;
1395 case 2:
1396 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1397 cmp = frag2->xMax - frag1->xMax;
1399 break;
1400 case 3:
1401 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1402 cmp = frag2->yMax - frag1->yMax;
1404 break;
1406 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1409 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
1410 TextLineFrag *frag1 = (TextLineFrag *)p1;
1411 TextLineFrag *frag2 = (TextLineFrag *)p2;
1412 double cmp;
1414 cmp = 0; // make gcc happy
1415 switch (frag1->line->rot) {
1416 case 0:
1417 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1418 cmp = frag1->yMin - frag2->yMin;
1420 break;
1421 case 1:
1422 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1423 cmp = frag2->xMax - frag1->xMax;
1425 break;
1426 case 2:
1427 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1428 cmp = frag2->yMin - frag1->yMin;
1430 break;
1431 case 3:
1432 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1433 cmp = frag1->xMax - frag2->xMax;
1435 break;
1437 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1440 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) {
1441 TextLineFrag *frag1 = (TextLineFrag *)p1;
1442 TextLineFrag *frag2 = (TextLineFrag *)p2;
1443 double cmp;
1445 // if columns overlap, compare y values
1446 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1447 frag2->line->col[frag2->start]) &&
1448 frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1449 frag1->line->col[frag1->start])) {
1450 cmp = 0; // make gcc happy
1451 switch (frag1->line->blk->page->primaryRot) {
1452 case 0: cmp = frag1->yMin - frag2->yMin; break;
1453 case 1: cmp = frag2->xMax - frag1->xMax; break;
1454 case 2: cmp = frag2->yMin - frag1->yMin; break;
1455 case 3: cmp = frag1->xMax - frag2->xMax; break;
1457 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1460 // otherwise, compare starting column
1461 return frag1->col - frag2->col;
1464 int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) {
1465 TextLineFrag *frag1 = (TextLineFrag *)p1;
1466 TextLineFrag *frag2 = (TextLineFrag *)p2;
1467 double cmp;
1469 // if columns overlap, compare y values
1470 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1471 frag2->line->col[frag2->start]) &&
1472 frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1473 frag1->line->col[frag1->start])) {
1474 cmp = 0; // make gcc happy
1475 switch (frag1->line->rot) {
1476 case 0: cmp = frag1->yMin - frag2->yMin; break;
1477 case 1: cmp = frag2->xMax - frag1->xMax; break;
1478 case 2: cmp = frag2->yMin - frag1->yMin; break;
1479 case 3: cmp = frag1->xMax - frag2->xMax; break;
1481 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1484 // otherwise, compare starting column
1485 return frag1->col - frag2->col;
1488 //------------------------------------------------------------------------
1489 // TextBlock
1490 //------------------------------------------------------------------------
1492 TextBlock::TextBlock(TextPage *pageA, int rotA) {
1493 page = pageA;
1494 rot = rotA;
1495 xMin = yMin = 0;
1496 xMax = yMax = -1;
1497 priMin = 0;
1498 priMax = page->pageWidth;
1499 pool = new TextPool();
1500 lines = NULL;
1501 curLine = NULL;
1502 next = NULL;
1503 stackNext = NULL;
1504 tableId = -1;
1505 tableEnd = gFalse;
1508 TextBlock::~TextBlock() {
1509 TextLine *line;
1511 delete pool;
1512 while (lines) {
1513 line = lines;
1514 lines = lines->next;
1515 delete line;
1519 void TextBlock::addWord(TextWord *word) {
1520 pool->addWord(word);
1521 if (xMin > xMax) {
1522 xMin = word->xMin;
1523 xMax = word->xMax;
1524 yMin = word->yMin;
1525 yMax = word->yMax;
1526 } else {
1527 if (word->xMin < xMin) {
1528 xMin = word->xMin;
1530 if (word->xMax > xMax) {
1531 xMax = word->xMax;
1533 if (word->yMin < yMin) {
1534 yMin = word->yMin;
1536 if (word->yMax > yMax) {
1537 yMax = word->yMax;
1542 void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) {
1543 TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1544 TextLine *line, *line0, *line1;
1545 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1546 int baseIdx, bestWordBaseIdx, idx0, idx1;
1547 double minBase, maxBase;
1548 double fontSize, wordSpacing, delta, priDelta, secDelta;
1549 TextLine **lineArray;
1550 GBool found, overlap;
1551 int col1, col2;
1552 int i, j, k;
1554 // discard duplicated text (fake boldface, drop shadows)
1555 for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1556 word0 = pool->getPool(idx0);
1557 while (word0) {
1558 priDelta = dupMaxPriDelta * word0->fontSize;
1559 secDelta = dupMaxSecDelta * word0->fontSize;
1560 maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1561 found = gFalse;
1562 word1 = word2 = NULL; // make gcc happy
1563 for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1564 if (idx1 == idx0) {
1565 word1 = word0;
1566 word2 = word0->next;
1567 } else {
1568 word1 = NULL;
1569 word2 = pool->getPool(idx1);
1571 for (; word2; word1 = word2, word2 = word2->next) {
1572 if (word2->len == word0->len &&
1573 !memcmp(word2->text, word0->text,
1574 word0->len * sizeof(Unicode))) {
1575 switch (rot) {
1576 case 0:
1577 case 2:
1578 found = fabs(word0->xMin - word2->xMin) < priDelta &&
1579 fabs(word0->xMax - word2->xMax) < priDelta &&
1580 fabs(word0->yMin - word2->yMin) < secDelta &&
1581 fabs(word0->yMax - word2->yMax) < secDelta;
1582 break;
1583 case 1:
1584 case 3:
1585 found = fabs(word0->xMin - word2->xMin) < secDelta &&
1586 fabs(word0->xMax - word2->xMax) < secDelta &&
1587 fabs(word0->yMin - word2->yMin) < priDelta &&
1588 fabs(word0->yMax - word2->yMax) < priDelta;
1589 break;
1592 if (found) {
1593 break;
1596 if (found) {
1597 break;
1600 if (found) {
1601 if (word1) {
1602 word1->next = word2->next;
1603 } else {
1604 pool->setPool(idx1, word2->next);
1606 delete word2;
1607 } else {
1608 word0 = word0->next;
1613 // build the lines
1614 curLine = NULL;
1615 poolMinBaseIdx = pool->minBaseIdx;
1616 charCount = 0;
1617 nLines = 0;
1618 while (1) {
1620 // find the first non-empty line in the pool
1621 for (;
1622 poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1623 ++poolMinBaseIdx) ;
1624 if (poolMinBaseIdx > pool->maxBaseIdx) {
1625 break;
1628 // look for the left-most word in the first four lines of the
1629 // pool -- this avoids starting with a superscript word
1630 startBaseIdx = poolMinBaseIdx;
1631 for (baseIdx = poolMinBaseIdx + 1;
1632 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1633 ++baseIdx) {
1634 if (!pool->getPool(baseIdx)) {
1635 continue;
1637 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1638 < 0) {
1639 startBaseIdx = baseIdx;
1643 // create a new line
1644 word0 = pool->getPool(startBaseIdx);
1645 pool->setPool(startBaseIdx, word0->next);
1646 word0->next = NULL;
1647 line = new TextLine(this, word0->rot, word0->base);
1648 line->addWord(word0);
1649 lastWord = word0;
1651 // compute the search range
1652 fontSize = word0->fontSize;
1653 minBase = word0->base - maxIntraLineDelta * fontSize;
1654 maxBase = word0->base + maxIntraLineDelta * fontSize;
1655 minBaseIdx = pool->getBaseIdx(minBase);
1656 maxBaseIdx = pool->getBaseIdx(maxBase);
1657 wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
1659 // find the rest of the words in this line
1660 while (1) {
1662 // find the left-most word whose baseline is in the range for
1663 // this line
1664 bestWordBaseIdx = 0;
1665 bestWord0 = bestWord1 = NULL;
1666 overlap = gFalse;
1667 for (baseIdx = minBaseIdx;
1668 !overlap && baseIdx <= maxBaseIdx;
1669 ++baseIdx) {
1670 for (word0 = NULL, word1 = pool->getPool(baseIdx);
1671 word1;
1672 word0 = word1, word1 = word1->next) {
1673 if (word1->base >= minBase &&
1674 word1->base <= maxBase) {
1675 delta = lastWord->primaryDelta(word1);
1676 if (delta < minCharSpacing * fontSize) {
1677 overlap = gTrue;
1678 break;
1679 } else {
1680 if (delta < wordSpacing &&
1681 (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1682 bestWordBaseIdx = baseIdx;
1683 bestWord0 = word0;
1684 bestWord1 = word1;
1686 break;
1691 if (overlap || !bestWord1) {
1692 break;
1695 // remove it from the pool, and add it to the line
1696 if (bestWord0) {
1697 bestWord0->next = bestWord1->next;
1698 } else {
1699 pool->setPool(bestWordBaseIdx, bestWord1->next);
1701 bestWord1->next = NULL;
1702 line->addWord(bestWord1);
1703 lastWord = bestWord1;
1706 // add the line
1707 if (curLine && line->cmpYX(curLine) > 0) {
1708 line0 = curLine;
1709 line1 = curLine->next;
1710 } else {
1711 line0 = NULL;
1712 line1 = lines;
1714 for (;
1715 line1 && line->cmpYX(line1) > 0;
1716 line0 = line1, line1 = line1->next) ;
1717 if (line0) {
1718 line0->next = line;
1719 } else {
1720 lines = line;
1722 line->next = line1;
1723 curLine = line;
1724 line->coalesce(uMap);
1725 charCount += line->len;
1726 ++nLines;
1729 // sort lines into xy order for column assignment
1730 lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1731 for (line = lines, i = 0; line; line = line->next, ++i) {
1732 lineArray[i] = line;
1734 qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1736 // column assignment
1737 nColumns = 0;
1738 if (fixedPitch) {
1739 for (i = 0; i < nLines; ++i) {
1740 line0 = lineArray[i];
1741 col1 = 0; // make gcc happy
1742 switch (rot) {
1743 case 0:
1744 col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
1745 break;
1746 case 1:
1747 col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
1748 break;
1749 case 2:
1750 col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
1751 break;
1752 case 3:
1753 col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
1754 break;
1756 for (k = 0; k <= line0->len; ++k) {
1757 line0->col[k] += col1;
1759 if (line0->col[line0->len] > nColumns) {
1760 nColumns = line0->col[line0->len];
1763 } else {
1764 for (i = 0; i < nLines; ++i) {
1765 line0 = lineArray[i];
1766 col1 = 0;
1767 for (j = 0; j < i; ++j) {
1768 line1 = lineArray[j];
1769 if (line1->primaryDelta(line0) >= 0) {
1770 col2 = line1->col[line1->len] + 1;
1771 } else {
1772 k = 0; // make gcc happy
1773 switch (rot) {
1774 case 0:
1775 for (k = 0;
1776 k < line1->len &&
1777 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1778 ++k) ;
1779 break;
1780 case 1:
1781 for (k = 0;
1782 k < line1->len &&
1783 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1784 ++k) ;
1785 break;
1786 case 2:
1787 for (k = 0;
1788 k < line1->len &&
1789 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1790 ++k) ;
1791 break;
1792 case 3:
1793 for (k = 0;
1794 k < line1->len &&
1795 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1796 ++k) ;
1797 break;
1799 col2 = line1->col[k];
1801 if (col2 > col1) {
1802 col1 = col2;
1805 for (k = 0; k <= line0->len; ++k) {
1806 line0->col[k] += col1;
1808 if (line0->col[line0->len] > nColumns) {
1809 nColumns = line0->col[line0->len];
1813 gfree(lineArray);
1816 void TextBlock::updatePriMinMax(TextBlock *blk) {
1817 double newPriMin, newPriMax;
1818 GBool gotPriMin, gotPriMax;
1820 gotPriMin = gotPriMax = gFalse;
1821 newPriMin = newPriMax = 0; // make gcc happy
1822 switch (page->primaryRot) {
1823 case 0:
1824 case 2:
1825 if (blk->yMin < yMax && blk->yMax > yMin) {
1826 if (blk->xMin < xMin) {
1827 newPriMin = blk->xMax;
1828 gotPriMin = gTrue;
1830 if (blk->xMax > xMax) {
1831 newPriMax = blk->xMin;
1832 gotPriMax = gTrue;
1835 break;
1836 case 1:
1837 case 3:
1838 if (blk->xMin < xMax && blk->xMax > xMin) {
1839 if (blk->yMin < yMin) {
1840 newPriMin = blk->yMax;
1841 gotPriMin = gTrue;
1843 if (blk->yMax > yMax) {
1844 newPriMax = blk->yMin;
1845 gotPriMax = gTrue;
1848 break;
1850 if (gotPriMin) {
1851 if (newPriMin > xMin) {
1852 newPriMin = xMin;
1854 if (newPriMin > priMin) {
1855 priMin = newPriMin;
1858 if (gotPriMax) {
1859 if (newPriMax < xMax) {
1860 newPriMax = xMax;
1862 if (newPriMax < priMax) {
1863 priMax = newPriMax;
1868 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1869 TextBlock *blk1 = *(TextBlock **)p1;
1870 TextBlock *blk2 = *(TextBlock **)p2;
1871 double cmp;
1873 cmp = 0; // make gcc happy
1874 switch (blk1->page->primaryRot) {
1875 case 0:
1876 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1877 cmp = blk1->yMin - blk2->yMin;
1879 break;
1880 case 1:
1881 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1882 cmp = blk2->xMax - blk1->xMax;
1884 break;
1885 case 2:
1886 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1887 cmp = blk2->yMin - blk1->yMin;
1889 break;
1890 case 3:
1891 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1892 cmp = blk1->xMax - blk2->xMax;
1894 break;
1896 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1899 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1900 TextBlock *blk1 = *(TextBlock **)p1;
1901 TextBlock *blk2 = *(TextBlock **)p2;
1902 double cmp;
1904 cmp = 0; // make gcc happy
1905 switch (blk1->page->primaryRot) {
1906 case 0:
1907 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1908 cmp = blk1->xMin - blk2->xMin;
1910 break;
1911 case 1:
1912 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1913 cmp = blk1->yMin - blk2->yMin;
1915 break;
1916 case 2:
1917 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1918 cmp = blk2->xMax - blk1->xMax;
1920 break;
1921 case 3:
1922 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1923 cmp = blk2->yMax - blk1->yMax;
1925 break;
1927 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1930 int TextBlock::primaryCmp(TextBlock *blk) {
1931 double cmp;
1933 cmp = 0; // make gcc happy
1934 switch (rot) {
1935 case 0:
1936 cmp = xMin - blk->xMin;
1937 break;
1938 case 1:
1939 cmp = yMin - blk->yMin;
1940 break;
1941 case 2:
1942 cmp = blk->xMax - xMax;
1943 break;
1944 case 3:
1945 cmp = blk->yMax - yMax;
1946 break;
1948 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1951 double TextBlock::secondaryDelta(TextBlock *blk) {
1952 double delta;
1954 delta = 0; // make gcc happy
1955 switch (rot) {
1956 case 0:
1957 delta = blk->yMin - yMax;
1958 break;
1959 case 1:
1960 delta = xMin - blk->xMax;
1961 break;
1962 case 2:
1963 delta = yMin - blk->yMax;
1964 break;
1965 case 3:
1966 delta = blk->xMin - xMax;
1967 break;
1969 return delta;
1972 GBool TextBlock::isBelow(TextBlock *blk) {
1973 GBool below;
1975 below = gFalse; // make gcc happy
1976 switch (page->primaryRot) {
1977 case 0:
1978 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1979 yMin > blk->yMin;
1980 break;
1981 case 1:
1982 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1983 xMax < blk->xMax;
1984 break;
1985 case 2:
1986 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1987 yMax < blk->yMax;
1988 break;
1989 case 3:
1990 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1991 xMin > blk->xMin;
1992 break;
1995 return below;
1998 GBool TextBlock::isBeforeByRule1(TextBlock *blk1) {
1999 GBool before = gFalse;
2000 GBool overlap = gFalse;
2002 switch (this->page->primaryRot) {
2003 case 0:
2004 case 2:
2005 overlap = ((this->ExMin <= blk1->ExMin) &&
2006 (blk1->ExMin <= this->ExMax)) ||
2007 ((blk1->ExMin <= this->ExMin) &&
2008 (this->ExMin <= blk1->ExMax));
2009 break;
2010 case 1:
2011 case 3:
2012 overlap = ((this->EyMin <= blk1->EyMin) &&
2013 (blk1->EyMin <= this->EyMax)) ||
2014 ((blk1->EyMin <= this->EyMin) &&
2015 (this->EyMin <= blk1->EyMax));
2016 break;
2018 switch (this->page->primaryRot) {
2019 case 0:
2020 before = overlap && this->EyMin < blk1->EyMin;
2021 break;
2022 case 1:
2023 before = overlap && this->ExMax > blk1->ExMax;
2024 break;
2025 case 2:
2026 before = overlap && this->EyMax > blk1->EyMax;
2027 break;
2028 case 3:
2029 before = overlap && this->ExMin < blk1->ExMin;
2030 break;
2032 return before;
2035 GBool TextBlock::isBeforeByRule2(TextBlock *blk1) {
2036 double cmp = 0;
2037 int rotLR = rot;
2039 if (!page->primaryLR) {
2040 rotLR = (rotLR + 2) % 4;
2043 switch (rotLR) {
2044 case 0:
2045 cmp = ExMax - blk1->ExMin;
2046 break;
2047 case 1:
2048 cmp = EyMin - blk1->EyMax;
2049 break;
2050 case 2:
2051 cmp = blk1->ExMax - ExMin;
2052 break;
2053 case 3:
2054 cmp = blk1->EyMin - EyMax;
2055 break;
2057 return cmp <= 0;
2060 // Sort into reading order by performing a topological sort using the rules
2061 // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
2062 // See http://pubs.iupr.org/#2003-breuel-sdiut
2063 // Topological sort is done by depth first search, see
2064 // http://en.wikipedia.org/wiki/Topological_sorting
2065 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1,
2066 TextBlock **sorted, int sortPos,
2067 GBool* visited) {
2068 int pos2;
2069 TextBlock *blk1, *blk2, *blk3;
2070 GBool before;
2072 if (visited[pos1]) {
2073 return sortPos;
2076 blk1 = this;
2078 #if 0 // for debugging
2079 printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
2080 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2081 #endif
2082 visited[pos1] = gTrue;
2083 pos2 = -1;
2084 for (blk2 = blkList; blk2; blk2 = blk2->next) {
2085 pos2++;
2086 if (visited[pos2]) {
2087 // skip visited nodes
2088 continue;
2090 before = gFalse;
2092 // is blk2 before blk1? (for table entries)
2093 if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
2094 if (page->primaryLR) {
2095 if (blk2->xMax <= blk1->xMin &&
2096 blk2->yMin <= blk1->yMax &&
2097 blk2->yMax >= blk1->yMin)
2098 before = gTrue;
2099 } else {
2100 if (blk2->xMin >= blk1->xMax &&
2101 blk2->yMin <= blk1->yMax &&
2102 blk2->yMax >= blk1->yMin)
2103 before = gTrue;
2106 if (blk2->yMax <= blk1->yMin)
2107 before = gTrue;
2108 } else {
2109 if (blk2->isBeforeByRule1(blk1)) {
2110 // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
2111 before = gTrue;
2112 #if 0 // for debugging
2113 printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2114 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
2115 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2116 #endif
2117 } else if (blk2->isBeforeByRule2(blk1)) {
2118 // Rule (2) blk2 left of blk1, and no intervening blk3
2119 // such that blk1 is before blk3 by rule 1,
2120 // and blk3 is before blk2 by rule 1.
2121 before = gTrue;
2122 for (blk3 = blkList; blk3; blk3 = blk3->next) {
2123 if (blk3 == blk2 || blk3 == blk1) {
2124 continue;
2126 if (blk1->isBeforeByRule1(blk3) &&
2127 blk3->isBeforeByRule1(blk2)) {
2128 before = gFalse;
2129 break;
2132 #if 0 // for debugging
2133 if (before) {
2134 printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2135 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
2136 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
2138 #endif
2141 if (before) {
2142 // blk2 is before blk1, so it needs to be visited
2143 // before we can add blk1 to the sorted list.
2144 sortPos = blk2->visitDepthFirst(blkList, pos2, sorted, sortPos, visited);
2147 #if 0 // for debugging
2148 printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
2149 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2150 #endif
2151 sorted[sortPos++] = blk1;
2152 return sortPos;
2155 //------------------------------------------------------------------------
2156 // TextFlow
2157 //------------------------------------------------------------------------
2159 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
2160 page = pageA;
2161 xMin = blk->xMin;
2162 xMax = blk->xMax;
2163 yMin = blk->yMin;
2164 yMax = blk->yMax;
2165 priMin = blk->priMin;
2166 priMax = blk->priMax;
2167 blocks = lastBlk = blk;
2168 next = NULL;
2171 TextFlow::~TextFlow() {
2172 TextBlock *blk;
2174 while (blocks) {
2175 blk = blocks;
2176 blocks = blocks->next;
2177 delete blk;
2181 void TextFlow::addBlock(TextBlock *blk) {
2182 if (lastBlk) {
2183 lastBlk->next = blk;
2184 } else {
2185 blocks = blk;
2187 lastBlk = blk;
2188 if (blk->xMin < xMin) {
2189 xMin = blk->xMin;
2191 if (blk->xMax > xMax) {
2192 xMax = blk->xMax;
2194 if (blk->yMin < yMin) {
2195 yMin = blk->yMin;
2197 if (blk->yMax > yMax) {
2198 yMax = blk->yMax;
2202 GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
2203 GBool fits;
2205 // lower blocks must use smaller fonts
2206 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
2207 return gFalse;
2210 fits = gFalse; // make gcc happy
2211 switch (page->primaryRot) {
2212 case 0:
2213 fits = blk->xMin >= priMin && blk->xMax <= priMax;
2214 break;
2215 case 1:
2216 fits = blk->yMin >= priMin && blk->yMax <= priMax;
2217 break;
2218 case 2:
2219 fits = blk->xMin >= priMin && blk->xMax <= priMax;
2220 break;
2221 case 3:
2222 fits = blk->yMin >= priMin && blk->yMax <= priMax;
2223 break;
2225 return fits;
2228 #if TEXTOUT_WORD_LIST
2230 //------------------------------------------------------------------------
2231 // TextWordList
2232 //------------------------------------------------------------------------
2234 TextWordList::TextWordList(TextPage *text, GBool physLayout) {
2235 TextFlow *flow;
2236 TextBlock *blk;
2237 TextLine *line;
2238 TextWord *word;
2239 TextWord **wordArray;
2240 int nWords, i;
2242 words = new GooList();
2244 if (text->rawOrder) {
2245 for (word = text->rawWords; word; word = word->next) {
2246 words->append(word);
2249 } else if (physLayout) {
2250 // this is inefficient, but it's also the least useful of these
2251 // three cases
2252 nWords = 0;
2253 for (flow = text->flows; flow; flow = flow->next) {
2254 for (blk = flow->blocks; blk; blk = blk->next) {
2255 for (line = blk->lines; line; line = line->next) {
2256 for (word = line->words; word; word = word->next) {
2257 ++nWords;
2262 wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
2263 i = 0;
2264 for (flow = text->flows; flow; flow = flow->next) {
2265 for (blk = flow->blocks; blk; blk = blk->next) {
2266 for (line = blk->lines; line; line = line->next) {
2267 for (word = line->words; word; word = word->next) {
2268 wordArray[i++] = word;
2273 qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
2274 for (i = 0; i < nWords; ++i) {
2275 words->append(wordArray[i]);
2277 gfree(wordArray);
2279 } else {
2280 for (flow = text->flows; flow; flow = flow->next) {
2281 for (blk = flow->blocks; blk; blk = blk->next) {
2282 for (line = blk->lines; line; line = line->next) {
2283 for (word = line->words; word; word = word->next) {
2284 words->append(word);
2292 TextWordList::~TextWordList() {
2293 delete words;
2296 int TextWordList::getLength() {
2297 return words->getLength();
2300 TextWord *TextWordList::get(int idx) {
2301 if (idx < 0 || idx >= words->getLength()) {
2302 return NULL;
2304 return (TextWord *)words->get(idx);
2307 #endif // TEXTOUT_WORD_LIST
2309 //------------------------------------------------------------------------
2310 // TextPage
2311 //------------------------------------------------------------------------
2313 TextPage::TextPage(GBool rawOrderA) {
2314 int rot;
2316 refCnt = 1;
2317 rawOrder = rawOrderA;
2318 curWord = NULL;
2319 charPos = 0;
2320 curFont = NULL;
2321 curFontSize = 0;
2322 nest = 0;
2323 nTinyChars = 0;
2324 lastCharOverlap = gFalse;
2325 if (!rawOrder) {
2326 for (rot = 0; rot < 4; ++rot) {
2327 pools[rot] = new TextPool();
2330 flows = NULL;
2331 blocks = NULL;
2332 rawWords = NULL;
2333 rawLastWord = NULL;
2334 fonts = new GooList();
2335 lastFindXMin = lastFindYMin = 0;
2336 haveLastFind = gFalse;
2337 underlines = new GooList();
2338 links = new GooList();
2339 mergeCombining = gTrue;
2342 TextPage::~TextPage() {
2343 int rot;
2345 clear();
2346 if (!rawOrder) {
2347 for (rot = 0; rot < 4; ++rot) {
2348 delete pools[rot];
2351 delete fonts;
2352 deleteGooList(underlines, TextUnderline);
2353 deleteGooList(links, TextLink);
2356 void TextPage::incRefCnt() {
2357 refCnt++;
2360 void TextPage::decRefCnt() {
2361 if (--refCnt == 0)
2362 delete this;
2365 void TextPage::startPage(GfxState *state) {
2366 clear();
2367 if (state) {
2368 pageWidth = state->getPageWidth();
2369 pageHeight = state->getPageHeight();
2370 } else {
2371 pageWidth = pageHeight = 0;
2375 void TextPage::endPage() {
2376 if (curWord) {
2377 endWord();
2381 void TextPage::clear() {
2382 int rot;
2383 TextFlow *flow;
2384 TextWord *word;
2386 if (curWord) {
2387 delete curWord;
2388 curWord = NULL;
2390 if (rawOrder) {
2391 while (rawWords) {
2392 word = rawWords;
2393 rawWords = rawWords->next;
2394 delete word;
2396 } else {
2397 for (rot = 0; rot < 4; ++rot) {
2398 delete pools[rot];
2400 while (flows) {
2401 flow = flows;
2402 flows = flows->next;
2403 delete flow;
2405 gfree(blocks);
2407 deleteGooList(fonts, TextFontInfo);
2408 deleteGooList(underlines, TextUnderline);
2409 deleteGooList(links, TextLink);
2411 curWord = NULL;
2412 charPos = 0;
2413 curFont = NULL;
2414 curFontSize = 0;
2415 nest = 0;
2416 nTinyChars = 0;
2417 if (!rawOrder) {
2418 for (rot = 0; rot < 4; ++rot) {
2419 pools[rot] = new TextPool();
2422 flows = NULL;
2423 blocks = NULL;
2424 rawWords = NULL;
2425 rawLastWord = NULL;
2426 fonts = new GooList();
2427 underlines = new GooList();
2428 links = new GooList();
2431 void TextPage::updateFont(GfxState *state) {
2432 GfxFont *gfxFont;
2433 double *fm;
2434 char *name;
2435 int code, mCode, letterCode, anyCode;
2436 double w;
2437 int i;
2439 // get the font info object
2440 curFont = NULL;
2441 for (i = 0; i < fonts->getLength(); ++i) {
2442 curFont = (TextFontInfo *)fonts->get(i);
2443 if (curFont->matches(state)) {
2444 break;
2446 curFont = NULL;
2448 if (!curFont) {
2449 curFont = new TextFontInfo(state);
2450 fonts->append(curFont);
2453 // adjust the font size
2454 gfxFont = state->getFont();
2455 curFontSize = state->getTransformedFontSize();
2456 if (gfxFont && gfxFont->getType() == fontType3) {
2457 // This is a hack which makes it possible to deal with some Type 3
2458 // fonts. The problem is that it's impossible to know what the
2459 // base coordinate system used in the font is without actually
2460 // rendering the font. This code tries to guess by looking at the
2461 // width of the character 'm' (which breaks if the font is a
2462 // subset that doesn't contain 'm').
2463 mCode = letterCode = anyCode = -1;
2464 for (code = 0; code < 256; ++code) {
2465 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2466 int nameLen = name ? strlen(name) : 0;
2467 GBool nameOneChar = nameLen == 1 || (nameLen > 1 && name[1] == '\0');
2468 if (nameOneChar && name[0] == 'm') {
2469 mCode = code;
2471 if (letterCode < 0 && nameOneChar &&
2472 ((name[0] >= 'A' && name[0] <= 'Z') ||
2473 (name[0] >= 'a' && name[0] <= 'z'))) {
2474 letterCode = code;
2476 if (anyCode < 0 && name &&
2477 ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
2478 anyCode = code;
2481 if (mCode >= 0 &&
2482 (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
2483 // 0.6 is a generic average 'm' width -- yes, this is a hack
2484 curFontSize *= w / 0.6;
2485 } else if (letterCode >= 0 &&
2486 (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
2487 // even more of a hack: 0.5 is a generic letter width
2488 curFontSize *= w / 0.5;
2489 } else if (anyCode >= 0 &&
2490 (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
2491 // better than nothing: 0.5 is a generic character width
2492 curFontSize *= w / 0.5;
2494 fm = gfxFont->getFontMatrix();
2495 if (fm[0] != 0) {
2496 curFontSize *= fabs(fm[3] / fm[0]);
2501 void TextPage::beginWord(GfxState *state) {
2502 GfxFont *gfxFont;
2503 double *fontm;
2504 double m[4], m2[4];
2505 int rot;
2507 // This check is needed because Type 3 characters can contain
2508 // text-drawing operations (when TextPage is being used via
2509 // {X,Win}SplashOutputDev rather than TextOutputDev).
2510 if (curWord) {
2511 ++nest;
2512 return;
2515 // compute the rotation
2516 state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
2517 gfxFont = state->getFont();
2518 if (gfxFont && gfxFont->getType() == fontType3) {
2519 fontm = state->getFont()->getFontMatrix();
2520 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
2521 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
2522 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
2523 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
2524 m[0] = m2[0];
2525 m[1] = m2[1];
2526 m[2] = m2[2];
2527 m[3] = m2[3];
2529 if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
2530 rot = (m[0] > 0 || m[3] < 0) ? 0 : 2;
2531 } else {
2532 rot = (m[2] > 0) ? 1 : 3;
2535 // for vertical writing mode, the lines are effectively rotated 90
2536 // degrees
2537 if (gfxFont && gfxFont->getWMode()) {
2538 rot = (rot + 1) & 3;
2541 curWord = new TextWord(state, rot, curFontSize);
2544 void TextPage::addChar(GfxState *state, double x, double y,
2545 double dx, double dy,
2546 CharCode c, int nBytes, Unicode *u, int uLen) {
2547 double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2548 GBool overlap;
2549 int i;
2550 int wMode;
2551 Matrix mat;
2553 // subtract char and word spacing from the dx,dy values
2554 sp = state->getCharSpace();
2555 if (c == (CharCode)0x20) {
2556 sp += state->getWordSpace();
2558 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
2559 dx -= dx2;
2560 dy -= dy2;
2561 state->transformDelta(dx, dy, &w1, &h1);
2563 // throw away chars that aren't inside the page bounds
2564 // (and also do a sanity check on the character size)
2565 state->transform(x, y, &x1, &y1);
2566 if (x1 + w1 < 0 || x1 > pageWidth ||
2567 y1 + h1 < 0 || y1 > pageHeight ||
2568 x1 != x1 || y1 != y1 || // IEEE way of checking for isnan
2569 w1 != w1 || h1 != h1) {
2570 charPos += nBytes;
2571 return;
2574 // check the tiny chars limit
2575 if (!globalParams->getTextKeepTinyChars() &&
2576 fabs(w1) < 3 && fabs(h1) < 3) {
2577 if (++nTinyChars > 50000) {
2578 charPos += nBytes;
2579 return;
2583 // break words at space character
2584 if (uLen == 1 && u[0] == (Unicode)0x20) {
2585 charPos += nBytes;
2586 endWord();
2587 return;
2590 state->getFontTransMat(&mat.m[0], &mat.m[1], &mat.m[2], &mat.m[3]);
2591 mat.m[4] = x1;
2592 mat.m[5] = y1;
2594 if (mergeCombining && curWord && uLen == 1 &&
2595 curWord->addCombining(state, curFont, curFontSize, x1, y1, w1, h1, charPos, nBytes, c,
2596 u[0], mat)) {
2597 charPos += nBytes;
2598 return;
2601 // start a new word if:
2602 // (1) this character doesn't fall in the right place relative to
2603 // the end of the previous word (this places upper and lower
2604 // constraints on the position deltas along both the primary
2605 // and secondary axes), or
2606 // (2) this character overlaps the previous one (duplicated text), or
2607 // (3) the previous character was an overlap (we want each duplicated
2608 // character to be in a word by itself at this stage),
2609 // (4) the font size has changed
2610 // (5) the WMode changed
2611 if (curWord && curWord->len > 0) {
2612 base = sp = delta = 0; // make gcc happy
2613 switch (curWord->rot) {
2614 case 0:
2615 base = y1;
2616 sp = x1 - curWord->xMax;
2617 delta = x1 - curWord->edge[curWord->len - 1];
2618 break;
2619 case 1:
2620 base = x1;
2621 sp = y1 - curWord->yMax;
2622 delta = y1 - curWord->edge[curWord->len - 1];
2623 break;
2624 case 2:
2625 base = y1;
2626 sp = curWord->xMin - x1;
2627 delta = curWord->edge[curWord->len - 1] - x1;
2628 break;
2629 case 3:
2630 base = x1;
2631 sp = curWord->yMin - y1;
2632 delta = curWord->edge[curWord->len - 1] - y1;
2633 break;
2635 overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
2636 fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2637 wMode = curFont->getWMode();
2638 if (overlap || lastCharOverlap ||
2639 sp < -minDupBreakOverlap * curWord->fontSize ||
2640 sp > minWordBreakSpace * curWord->fontSize ||
2641 fabs(base - curWord->base) > 0.5 ||
2642 curFontSize != curWord->fontSize ||
2643 wMode != curWord->wMode) {
2644 endWord();
2646 lastCharOverlap = overlap;
2647 } else {
2648 lastCharOverlap = gFalse;
2651 if (uLen != 0) {
2652 // start a new word if needed
2653 if (!curWord) {
2654 beginWord(state);
2657 // page rotation and/or transform matrices can cause text to be
2658 // drawn in reverse order -- in this case, swap the begin/end
2659 // coordinates and break text into individual chars
2660 if ((curWord->rot == 0 && w1 < 0) ||
2661 (curWord->rot == 1 && h1 < 0) ||
2662 (curWord->rot == 2 && w1 > 0) ||
2663 (curWord->rot == 3 && h1 > 0)) {
2664 endWord();
2665 beginWord(state);
2666 x1 += w1;
2667 y1 += h1;
2668 w1 = -w1;
2669 h1 = -h1;
2672 // add the characters to the current word
2673 w1 /= uLen;
2674 h1 /= uLen;
2675 for (i = 0; i < uLen; ++i) {
2676 curWord->addChar(state, curFont, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i], mat);
2679 charPos += nBytes;
2682 void TextPage::incCharCount(int nChars) {
2683 charPos += nChars;
2686 void TextPage::endWord() {
2687 // This check is needed because Type 3 characters can contain
2688 // text-drawing operations (when TextPage is being used via
2689 // {X,Win}SplashOutputDev rather than TextOutputDev).
2690 if (nest > 0) {
2691 --nest;
2692 return;
2695 if (curWord) {
2696 addWord(curWord);
2697 curWord = NULL;
2701 void TextPage::addWord(TextWord *word) {
2702 // throw away zero-length words -- they don't have valid xMin/xMax
2703 // values, and they're useless anyway
2704 if (word->len == 0) {
2705 delete word;
2706 return;
2709 if (rawOrder) {
2710 if (rawLastWord) {
2711 rawLastWord->next = word;
2712 } else {
2713 rawWords = word;
2715 rawLastWord = word;
2716 } else {
2717 pools[word->rot]->addWord(word);
2721 void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
2722 underlines->append(new TextUnderline(x0, y0, x1, y1));
2725 void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link) {
2726 links->append(new TextLink(xMin, yMin, xMax, yMax, link));
2729 void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) {
2730 UnicodeMap *uMap;
2731 TextPool *pool;
2732 TextWord *word0, *word1, *word2;
2733 TextLine *line;
2734 TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2;
2735 TextFlow *flow, *lastFlow;
2736 TextUnderline *underline;
2737 TextLink *link;
2738 int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2739 double minBase, maxBase, newMinBase, newMaxBase;
2740 double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2741 GBool found;
2742 int count[4];
2743 int lrCount;
2744 int col1, col2;
2745 int i, j, n;
2747 if (rawOrder) {
2748 primaryRot = 0;
2749 primaryLR = gTrue;
2750 return;
2753 uMap = globalParams->getTextEncoding();
2754 blkList = NULL;
2755 lastBlk = NULL;
2756 nBlocks = 0;
2757 primaryRot = 0;
2759 #if 0 // for debugging
2760 printf("*** initial words ***\n");
2761 for (rot = 0; rot < 4; ++rot) {
2762 pool = pools[rot];
2763 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2764 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2765 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2766 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2767 word0->base, word0->fontSize, rot*90, word0->link);
2768 for (i = 0; i < word0->len; ++i) {
2769 fputc(word0->text[i] & 0xff, stdout);
2771 printf("'\n");
2775 printf("\n");
2776 #endif
2778 #if 0 //~ for debugging
2779 for (i = 0; i < underlines->getLength(); ++i) {
2780 underline = (TextUnderline *)underlines->get(i);
2781 printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2782 underline->x0, underline->x1, underline->y0, underline->y1,
2783 underline->horiz);
2785 #endif
2787 if (doHTML) {
2789 //----- handle underlining
2790 for (i = 0; i < underlines->getLength(); ++i) {
2791 underline = (TextUnderline *)underlines->get(i);
2792 if (underline->horiz) {
2793 // rot = 0
2794 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2795 startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2796 endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2797 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2798 for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2799 //~ need to check the y value against the word baseline
2800 if (underline->x0 < word0->xMin + underlineSlack &&
2801 word0->xMax - underlineSlack < underline->x1) {
2802 word0->underlined = gTrue;
2808 // rot = 2
2809 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2810 startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2811 endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2812 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2813 for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2814 if (underline->x0 < word0->xMin + underlineSlack &&
2815 word0->xMax - underlineSlack < underline->x1) {
2816 word0->underlined = gTrue;
2821 } else {
2822 // rot = 1
2823 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2824 startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2825 endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2826 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2827 for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2828 if (underline->y0 < word0->yMin + underlineSlack &&
2829 word0->yMax - underlineSlack < underline->y1) {
2830 word0->underlined = gTrue;
2836 // rot = 3
2837 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2838 startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2839 endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2840 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2841 for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2842 if (underline->y0 < word0->yMin + underlineSlack &&
2843 word0->yMax - underlineSlack < underline->y1) {
2844 word0->underlined = gTrue;
2852 //----- handle links
2853 for (i = 0; i < links->getLength(); ++i) {
2854 link = (TextLink *)links->get(i);
2856 // rot = 0
2857 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2858 startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2859 endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2860 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2861 for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2862 if (link->xMin < word0->xMin + hyperlinkSlack &&
2863 word0->xMax - hyperlinkSlack < link->xMax &&
2864 link->yMin < word0->yMin + hyperlinkSlack &&
2865 word0->yMax - hyperlinkSlack < link->yMax) {
2866 word0->link = link->link;
2872 // rot = 2
2873 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2874 startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2875 endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2876 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2877 for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2878 if (link->xMin < word0->xMin + hyperlinkSlack &&
2879 word0->xMax - hyperlinkSlack < link->xMax &&
2880 link->yMin < word0->yMin + hyperlinkSlack &&
2881 word0->yMax - hyperlinkSlack < link->yMax) {
2882 word0->link = link->link;
2888 // rot = 1
2889 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2890 startBaseIdx = pools[1]->getBaseIdx(link->xMin);
2891 endBaseIdx = pools[1]->getBaseIdx(link->xMax);
2892 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2893 for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2894 if (link->yMin < word0->yMin + hyperlinkSlack &&
2895 word0->yMax - hyperlinkSlack < link->yMax &&
2896 link->xMin < word0->xMin + hyperlinkSlack &&
2897 word0->xMax - hyperlinkSlack < link->xMax) {
2898 word0->link = link->link;
2904 // rot = 3
2905 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2906 startBaseIdx = pools[3]->getBaseIdx(link->xMin);
2907 endBaseIdx = pools[3]->getBaseIdx(link->xMax);
2908 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2909 for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2910 if (link->yMin < word0->yMin + hyperlinkSlack &&
2911 word0->yMax - hyperlinkSlack < link->yMax &&
2912 link->xMin < word0->xMin + hyperlinkSlack &&
2913 word0->xMax - hyperlinkSlack < link->xMax) {
2914 word0->link = link->link;
2922 //----- assemble the blocks
2924 //~ add an outer loop for writing mode (vertical text)
2926 // build blocks for each rotation value
2927 for (rot = 0; rot < 4; ++rot) {
2928 pool = pools[rot];
2929 poolMinBaseIdx = pool->minBaseIdx;
2930 count[rot] = 0;
2932 // add blocks until no more words are left
2933 while (1) {
2935 // find the first non-empty line in the pool
2936 for (;
2937 poolMinBaseIdx <= pool->maxBaseIdx &&
2938 !pool->getPool(poolMinBaseIdx);
2939 ++poolMinBaseIdx) ;
2940 if (poolMinBaseIdx > pool->maxBaseIdx) {
2941 break;
2944 // look for the left-most word in the first four lines of the
2945 // pool -- this avoids starting with a superscript word
2946 startBaseIdx = poolMinBaseIdx;
2947 for (baseIdx = poolMinBaseIdx + 1;
2948 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2949 ++baseIdx) {
2950 if (!pool->getPool(baseIdx)) {
2951 continue;
2953 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2954 < 0) {
2955 startBaseIdx = baseIdx;
2959 // create a new block
2960 word0 = pool->getPool(startBaseIdx);
2961 pool->setPool(startBaseIdx, word0->next);
2962 word0->next = NULL;
2963 blk = new TextBlock(this, rot);
2964 blk->addWord(word0);
2966 fontSize = word0->fontSize;
2967 minBase = maxBase = word0->base;
2968 colSpace1 = minColSpacing1 * fontSize;
2969 colSpace2 = minColSpacing2 * fontSize;
2970 lineSpace = maxLineSpacingDelta * fontSize;
2971 intraLineSpace = maxIntraLineDelta * fontSize;
2973 // add words to the block
2974 do {
2975 found = gFalse;
2977 // look for words on the line above the current top edge of
2978 // the block
2979 newMinBase = minBase;
2980 for (baseIdx = pool->getBaseIdx(minBase);
2981 baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2982 --baseIdx) {
2983 word0 = NULL;
2984 word1 = pool->getPool(baseIdx);
2985 while (word1) {
2986 if (word1->base < minBase &&
2987 word1->base >= minBase - lineSpace &&
2988 ((rot == 0 || rot == 2)
2989 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2990 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2991 fabs(word1->fontSize - fontSize) <
2992 maxBlockFontSizeDelta1 * fontSize) {
2993 word2 = word1;
2994 if (word0) {
2995 word0->next = word1->next;
2996 } else {
2997 pool->setPool(baseIdx, word1->next);
2999 word1 = word1->next;
3000 word2->next = NULL;
3001 blk->addWord(word2);
3002 found = gTrue;
3003 newMinBase = word2->base;
3004 } else {
3005 word0 = word1;
3006 word1 = word1->next;
3010 minBase = newMinBase;
3012 // look for words on the line below the current bottom edge of
3013 // the block
3014 newMaxBase = maxBase;
3015 for (baseIdx = pool->getBaseIdx(maxBase);
3016 baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
3017 ++baseIdx) {
3018 word0 = NULL;
3019 word1 = pool->getPool(baseIdx);
3020 while (word1) {
3021 if (word1->base > maxBase &&
3022 word1->base <= maxBase + lineSpace &&
3023 ((rot == 0 || rot == 2)
3024 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
3025 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
3026 fabs(word1->fontSize - fontSize) <
3027 maxBlockFontSizeDelta1 * fontSize) {
3028 word2 = word1;
3029 if (word0) {
3030 word0->next = word1->next;
3031 } else {
3032 pool->setPool(baseIdx, word1->next);
3034 word1 = word1->next;
3035 word2->next = NULL;
3036 blk->addWord(word2);
3037 found = gTrue;
3038 newMaxBase = word2->base;
3039 } else {
3040 word0 = word1;
3041 word1 = word1->next;
3045 maxBase = newMaxBase;
3047 // look for words that are on lines already in the block, and
3048 // that overlap the block horizontally
3049 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
3050 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
3051 ++baseIdx) {
3052 word0 = NULL;
3053 word1 = pool->getPool(baseIdx);
3054 while (word1) {
3055 if (word1->base >= minBase - intraLineSpace &&
3056 word1->base <= maxBase + intraLineSpace &&
3057 ((rot == 0 || rot == 2)
3058 ? (word1->xMin < blk->xMax + colSpace1 &&
3059 word1->xMax > blk->xMin - colSpace1)
3060 : (word1->yMin < blk->yMax + colSpace1 &&
3061 word1->yMax > blk->yMin - colSpace1)) &&
3062 fabs(word1->fontSize - fontSize) <
3063 maxBlockFontSizeDelta2 * fontSize) {
3064 word2 = word1;
3065 if (word0) {
3066 word0->next = word1->next;
3067 } else {
3068 pool->setPool(baseIdx, word1->next);
3070 word1 = word1->next;
3071 word2->next = NULL;
3072 blk->addWord(word2);
3073 found = gTrue;
3074 } else {
3075 word0 = word1;
3076 word1 = word1->next;
3081 // only check for outlying words (the next two chunks of code)
3082 // if we didn't find anything else
3083 if (found) {
3084 continue;
3087 // scan down the left side of the block, looking for words
3088 // that are near (but not overlapping) the block; if there are
3089 // three or fewer, add them to the block
3090 n = 0;
3091 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
3092 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
3093 ++baseIdx) {
3094 word1 = pool->getPool(baseIdx);
3095 while (word1) {
3096 if (word1->base >= minBase - intraLineSpace &&
3097 word1->base <= maxBase + intraLineSpace &&
3098 ((rot == 0 || rot == 2)
3099 ? (word1->xMax <= blk->xMin &&
3100 word1->xMax > blk->xMin - colSpace2)
3101 : (word1->yMax <= blk->yMin &&
3102 word1->yMax > blk->yMin - colSpace2)) &&
3103 fabs(word1->fontSize - fontSize) <
3104 maxBlockFontSizeDelta3 * fontSize) {
3105 ++n;
3106 break;
3108 word1 = word1->next;
3111 if (n > 0 && n <= 3) {
3112 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
3113 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
3114 ++baseIdx) {
3115 word0 = NULL;
3116 word1 = pool->getPool(baseIdx);
3117 while (word1) {
3118 if (word1->base >= minBase - intraLineSpace &&
3119 word1->base <= maxBase + intraLineSpace &&
3120 ((rot == 0 || rot == 2)
3121 ? (word1->xMax <= blk->xMin &&
3122 word1->xMax > blk->xMin - colSpace2)
3123 : (word1->yMax <= blk->yMin &&
3124 word1->yMax > blk->yMin - colSpace2)) &&
3125 fabs(word1->fontSize - fontSize) <
3126 maxBlockFontSizeDelta3 * fontSize) {
3127 word2 = word1;
3128 if (word0) {
3129 word0->next = word1->next;
3130 } else {
3131 pool->setPool(baseIdx, word1->next);
3133 word1 = word1->next;
3134 word2->next = NULL;
3135 blk->addWord(word2);
3136 if (word2->base < minBase) {
3137 minBase = word2->base;
3138 } else if (word2->base > maxBase) {
3139 maxBase = word2->base;
3141 found = gTrue;
3142 break;
3143 } else {
3144 word0 = word1;
3145 word1 = word1->next;
3151 // scan down the right side of the block, looking for words
3152 // that are near (but not overlapping) the block; if there are
3153 // three or fewer, add them to the block
3154 n = 0;
3155 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
3156 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
3157 ++baseIdx) {
3158 word1 = pool->getPool(baseIdx);
3159 while (word1) {
3160 if (word1->base >= minBase - intraLineSpace &&
3161 word1->base <= maxBase + intraLineSpace &&
3162 ((rot == 0 || rot == 2)
3163 ? (word1->xMin >= blk->xMax &&
3164 word1->xMin < blk->xMax + colSpace2)
3165 : (word1->yMin >= blk->yMax &&
3166 word1->yMin < blk->yMax + colSpace2)) &&
3167 fabs(word1->fontSize - fontSize) <
3168 maxBlockFontSizeDelta3 * fontSize) {
3169 ++n;
3170 break;
3172 word1 = word1->next;
3175 if (n > 0 && n <= 3) {
3176 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
3177 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
3178 ++baseIdx) {
3179 word0 = NULL;
3180 word1 = pool->getPool(baseIdx);
3181 while (word1) {
3182 if (word1->base >= minBase - intraLineSpace &&
3183 word1->base <= maxBase + intraLineSpace &&
3184 ((rot == 0 || rot == 2)
3185 ? (word1->xMin >= blk->xMax &&
3186 word1->xMin < blk->xMax + colSpace2)
3187 : (word1->yMin >= blk->yMax &&
3188 word1->yMin < blk->yMax + colSpace2)) &&
3189 fabs(word1->fontSize - fontSize) <
3190 maxBlockFontSizeDelta3 * fontSize) {
3191 word2 = word1;
3192 if (word0) {
3193 word0->next = word1->next;
3194 } else {
3195 pool->setPool(baseIdx, word1->next);
3197 word1 = word1->next;
3198 word2->next = NULL;
3199 blk->addWord(word2);
3200 if (word2->base < minBase) {
3201 minBase = word2->base;
3202 } else if (word2->base > maxBase) {
3203 maxBase = word2->base;
3205 found = gTrue;
3206 break;
3207 } else {
3208 word0 = word1;
3209 word1 = word1->next;
3215 } while (found);
3217 //~ need to compute the primary writing mode (horiz/vert) in
3218 //~ addition to primary rotation
3220 // coalesce the block, and add it to the list
3221 blk->coalesce(uMap, fixedPitch);
3222 if (lastBlk) {
3223 lastBlk->next = blk;
3224 } else {
3225 blkList = blk;
3227 lastBlk = blk;
3228 count[rot] += blk->charCount;
3229 ++nBlocks;
3232 if (count[rot] > count[primaryRot]) {
3233 primaryRot = rot;
3237 #if 0 // for debugging
3238 printf("*** rotation ***\n");
3239 for (rot = 0; rot < 4; ++rot) {
3240 printf(" %d: %6d\n", rot, count[rot]);
3242 printf(" primary rot = %d\n", primaryRot);
3243 printf("\n");
3244 #endif
3246 #if 0 // for debugging
3247 printf("*** blocks ***\n");
3248 for (blk = blkList; blk; blk = blk->next) {
3249 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
3250 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
3251 for (line = blk->lines; line; line = line->next) {
3252 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
3253 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
3254 for (word0 = line->words; word0; word0 = word0->next) {
3255 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3256 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3257 word0->base, word0->fontSize, word0->spaceAfter);
3258 for (i = 0; i < word0->len; ++i) {
3259 fputc(word0->text[i] & 0xff, stdout);
3261 printf("'\n");
3265 printf("\n");
3266 #endif
3268 // determine the primary direction
3269 lrCount = 0;
3270 for (blk = blkList; blk; blk = blk->next) {
3271 for (line = blk->lines; line; line = line->next) {
3272 for (word0 = line->words; word0; word0 = word0->next) {
3273 for (i = 0; i < word0->len; ++i) {
3274 if (unicodeTypeL(word0->text[i])) {
3275 ++lrCount;
3276 } else if (unicodeTypeR(word0->text[i])) {
3277 --lrCount;
3283 primaryLR = lrCount >= 0;
3285 #if 0 // for debugging
3286 printf("*** direction ***\n");
3287 printf("lrCount = %d\n", lrCount);
3288 printf("primaryLR = %d\n", primaryLR);
3289 #endif
3291 //----- column assignment
3293 // sort blocks into xy order for column assignment
3294 if (blocks)
3295 gfree (blocks);
3296 if (physLayout && fixedPitch) {
3298 blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
3299 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3300 blocks[i] = blk;
3301 col1 = 0; // make gcc happy
3302 switch (primaryRot) {
3303 case 0:
3304 col1 = (int)(blk->xMin / fixedPitch + 0.5);
3305 break;
3306 case 1:
3307 col1 = (int)(blk->yMin / fixedPitch + 0.5);
3308 break;
3309 case 2:
3310 col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
3311 break;
3312 case 3:
3313 col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
3314 break;
3316 blk->col = col1;
3317 for (line = blk->lines; line; line = line->next) {
3318 for (j = 0; j <= line->len; ++j) {
3319 line->col[j] += col1;
3324 } else {
3326 // sort blocks into xy order for column assignment
3327 blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
3328 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3329 blocks[i] = blk;
3331 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
3333 // column assignment
3334 for (i = 0; i < nBlocks; ++i) {
3335 blk0 = blocks[i];
3336 col1 = 0;
3337 for (j = 0; j < i; ++j) {
3338 blk1 = blocks[j];
3339 col2 = 0; // make gcc happy
3340 switch (primaryRot) {
3341 case 0:
3342 if (blk0->xMin > blk1->xMax) {
3343 col2 = blk1->col + blk1->nColumns + 3;
3344 } else if (blk1->xMax == blk1->xMin) {
3345 col2 = blk1->col;
3346 } else {
3347 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
3348 (blk1->xMax - blk1->xMin)) *
3349 blk1->nColumns);
3351 break;
3352 case 1:
3353 if (blk0->yMin > blk1->yMax) {
3354 col2 = blk1->col + blk1->nColumns + 3;
3355 } else if (blk1->yMax == blk1->yMin) {
3356 col2 = blk1->col;
3357 } else {
3358 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
3359 (blk1->yMax - blk1->yMin)) *
3360 blk1->nColumns);
3362 break;
3363 case 2:
3364 if (blk0->xMax < blk1->xMin) {
3365 col2 = blk1->col + blk1->nColumns + 3;
3366 } else if (blk1->xMin == blk1->xMax) {
3367 col2 = blk1->col;
3368 } else {
3369 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
3370 (blk1->xMin - blk1->xMax)) *
3371 blk1->nColumns);
3373 break;
3374 case 3:
3375 if (blk0->yMax < blk1->yMin) {
3376 col2 = blk1->col + blk1->nColumns + 3;
3377 } else if (blk1->yMin == blk1->yMax) {
3378 col2 = blk1->col;
3379 } else {
3380 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
3381 (blk1->yMin - blk1->yMax)) *
3382 blk1->nColumns);
3384 break;
3386 if (col2 > col1) {
3387 col1 = col2;
3390 blk0->col = col1;
3391 for (line = blk0->lines; line; line = line->next) {
3392 for (j = 0; j <= line->len; ++j) {
3393 line->col[j] += col1;
3400 #if 0 // for debugging
3401 printf("*** blocks, after column assignment ***\n");
3402 for (blk = blkList; blk; blk = blk->next) {
3403 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
3404 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
3405 blk->nColumns);
3406 for (line = blk->lines; line; line = line->next) {
3407 printf(" line: col[0]=%d\n", line->col[0]);
3408 for (word0 = line->words; word0; word0 = word0->next) {
3409 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3410 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3411 word0->base, word0->fontSize, word0->spaceAfter);
3412 for (i = 0; i < word0->len; ++i) {
3413 fputc(word0->text[i] & 0xff, stdout);
3415 printf("'\n");
3419 printf("\n");
3420 #endif
3422 //----- reading order sort
3424 // compute space on left and right sides of each block
3425 for (i = 0; i < nBlocks; ++i) {
3426 blk0 = blocks[i];
3427 for (j = 0; j < nBlocks; ++j) {
3428 blk1 = blocks[j];
3429 if (blk1 != blk0) {
3430 blk0->updatePriMinMax(blk1);
3435 #if 0 // for debugging
3436 printf("PAGE\n");
3437 #endif
3439 int sortPos = 0;
3440 GBool *visited = (GBool *)gmallocn(nBlocks, sizeof(GBool));
3441 for (i = 0; i < nBlocks; i++) {
3442 visited[i] = gFalse;
3445 double bxMin0, byMin0, bxMin1, byMin1;
3446 int numTables = 0;
3447 int tableId = -1;
3448 int correspondenceX, correspondenceY;
3449 double xCentre1, yCentre1, xCentre2, yCentre2;
3450 double xCentre3, yCentre3, xCentre4, yCentre4;
3451 double deltaX, deltaY;
3452 TextBlock *fblk2 = NULL, *fblk3 = NULL, *fblk4 = NULL;
3454 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3455 blk1->ExMin = blk1->xMin;
3456 blk1->ExMax = blk1->xMax;
3457 blk1->EyMin = blk1->yMin;
3458 blk1->EyMax = blk1->yMax;
3460 bxMin0 = DBL_MAX;
3461 byMin0 = DBL_MAX;
3462 bxMin1 = DBL_MAX;
3463 byMin1 = DBL_MAX;
3465 fblk2 = NULL;
3466 fblk3 = NULL;
3467 fblk4 = NULL;
3469 /* find fblk2, fblk3 and fblk4 so that
3470 * fblk2 is on the right of blk1 and overlap with blk1 in y axis
3471 * fblk3 is under blk1 and overlap with blk1 in x axis
3472 * fblk4 is under blk1 and on the right of blk1
3473 * and they are closest to blk1
3475 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3476 if (blk2 != blk1) {
3477 if (blk2->yMin <= blk1->yMax &&
3478 blk2->yMax >= blk1->yMin &&
3479 blk2->xMin > blk1->xMax &&
3480 blk2->xMin < bxMin0) {
3481 bxMin0 = blk2->xMin;
3482 fblk2 = blk2;
3483 } else if (blk2->xMin <= blk1->xMax &&
3484 blk2->xMax >= blk1->xMin &&
3485 blk2->yMin > blk1->yMax &&
3486 blk2->yMin < byMin0) {
3487 byMin0 = blk2->yMin;
3488 fblk3 = blk2;
3489 } else if (blk2->xMin > blk1->xMax &&
3490 blk2->xMin < bxMin1 &&
3491 blk2->yMin > blk1->yMax &&
3492 blk2->yMin < byMin1) {
3493 bxMin1 = blk2->xMin;
3494 byMin1 = blk2->yMin;
3495 fblk4 = blk2;
3500 /* fblk4 can not overlap with fblk3 in x and with fblk2 in y
3501 * fblk2 can not overlap with fblk3 in x and y
3502 * fblk4 has to overlap with fblk3 in y and with fblk2 in x
3504 if (fblk2 != NULL &&
3505 fblk3 != NULL &&
3506 fblk4 != NULL) {
3507 if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) ||
3508 (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) ||
3509 (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin) ||
3510 (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin)) ||
3511 !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin &&
3512 fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3513 fblk2 = NULL;
3514 fblk3 = NULL;
3515 fblk4 = NULL;
3519 // if we found any then look whether they form a table
3520 if (fblk2 != NULL &&
3521 fblk3 != NULL &&
3522 fblk4 != NULL) {
3523 tableId = -1;
3524 correspondenceX = 0;
3525 correspondenceY = 0;
3526 deltaX = 0.0;
3527 deltaY = 0.0;
3529 if (blk1->lines && blk1->lines->words)
3530 deltaX = blk1->lines->words->getFontSize();
3531 if (fblk2->lines && fblk2->lines->words)
3532 deltaX = deltaX < fblk2->lines->words->getFontSize() ?
3533 deltaX : fblk2->lines->words->getFontSize();
3534 if (fblk3->lines && fblk3->lines->words)
3535 deltaX = deltaX < fblk3->lines->words->getFontSize() ?
3536 deltaX : fblk3->lines->words->getFontSize();
3537 if (fblk4->lines && fblk4->lines->words)
3538 deltaX = deltaX < fblk4->lines->words->getFontSize() ?
3539 deltaX : fblk4->lines->words->getFontSize();
3541 deltaY = deltaX;
3543 deltaX *= minColSpacing1;
3544 deltaY *= maxIntraLineDelta;
3546 xCentre1 = (blk1->xMax + blk1->xMin) / 2.0;
3547 yCentre1 = (blk1->yMax + blk1->yMin) / 2.0;
3548 xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0;
3549 yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0;
3550 xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0;
3551 yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0;
3552 xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0;
3553 yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0;
3555 // are blocks centrally aligned in x ?
3556 if (fabs (xCentre1 - xCentre3) <= deltaX &&
3557 fabs (xCentre2 - xCentre4) <= deltaX)
3558 correspondenceX++;
3560 // are blocks centrally aligned in y ?
3561 if (fabs (yCentre1 - yCentre2) <= deltaY &&
3562 fabs (yCentre3 - yCentre4) <= deltaY)
3563 correspondenceY++;
3565 // are blocks aligned to the left ?
3566 if (fabs (blk1->xMin - fblk3->xMin) <= deltaX &&
3567 fabs (fblk2->xMin - fblk4->xMin) <= deltaX)
3568 correspondenceX++;
3570 // are blocks aligned to the right ?
3571 if (fabs (blk1->xMax - fblk3->xMax) <= deltaX &&
3572 fabs (fblk2->xMax - fblk4->xMax) <= deltaX)
3573 correspondenceX++;
3575 // are blocks aligned to the top ?
3576 if (fabs (blk1->yMin - fblk2->yMin) <= deltaY &&
3577 fabs (fblk3->yMin - fblk4->yMin) <= deltaY)
3578 correspondenceY++;
3580 // are blocks aligned to the bottom ?
3581 if (fabs (blk1->yMax - fblk2->yMax) <= deltaY &&
3582 fabs (fblk3->yMax - fblk4->yMax) <= deltaY)
3583 correspondenceY++;
3585 // are blocks aligned in x and y ?
3586 if (correspondenceX > 0 &&
3587 correspondenceY > 0) {
3589 // find maximal tableId
3590 tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3591 tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3592 tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3593 tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3595 // if the tableId is -1, then we found new table
3596 if (tableId < 0) {
3597 tableId = numTables;
3598 numTables++;
3601 blk1->tableId = tableId;
3602 fblk2->tableId = tableId;
3603 fblk3->tableId = tableId;
3604 fblk4->tableId = tableId;
3609 /* set extended bounding boxes of all table entries
3610 * so that they contain whole table
3611 * (we need to process whole table size when comparing it
3612 * with regular text blocks)
3614 PDFRectangle *envelopes = new PDFRectangle [numTables];
3615 TextBlock **ending_blocks = new TextBlock* [numTables];
3617 for (i = 0; i < numTables; i++) {
3618 envelopes[i].x1 = DBL_MAX;
3619 envelopes[i].x2 = DBL_MIN;
3620 envelopes[i].y1 = DBL_MAX;
3621 envelopes[i].y2 = DBL_MIN;
3624 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3625 if (blk1->tableId >= 0) {
3626 if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3627 envelopes[blk1->tableId].x1 = blk1->ExMin;
3628 if (!blk1->page->primaryLR)
3629 ending_blocks[blk1->tableId] = blk1;
3632 if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3633 envelopes[blk1->tableId].x2 = blk1->ExMax;
3634 if (blk1->page->primaryLR)
3635 ending_blocks[blk1->tableId] = blk1;
3638 envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ?
3639 blk1->EyMin : envelopes[blk1->tableId].y1;
3640 envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ?
3641 blk1->EyMax : envelopes[blk1->tableId].y2;
3645 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3646 if (blk1->tableId >= 0 &&
3647 blk1->xMin <= ending_blocks[blk1->tableId]->xMax &&
3648 blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3649 blk1->tableEnd = gTrue;
3653 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3654 if (blk1->tableId >= 0) {
3655 blk1->ExMin = envelopes[blk1->tableId].x1;
3656 blk1->ExMax = envelopes[blk1->tableId].x2;
3657 blk1->EyMin = envelopes[blk1->tableId].y1;
3658 blk1->EyMax = envelopes[blk1->tableId].y2;
3661 delete[] envelopes;
3662 delete[] ending_blocks;
3665 /* set extended bounding boxes of all other blocks
3666 * so that they extend in x without hitting neighbours
3668 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3669 if (!(blk1->tableId >= 0)) {
3670 double xMax = DBL_MAX;
3671 double xMin = DBL_MIN;
3673 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3674 if (blk2 == blk1)
3675 continue;
3677 if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3678 if (blk2->xMin < xMax && blk2->xMin > blk1->xMax)
3679 xMax = blk2->xMin;
3681 if (blk2->xMax > xMin && blk2->xMax < blk1->xMin)
3682 xMin = blk2->xMax;
3686 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3687 if (blk2 == blk1)
3688 continue;
3690 if (blk2->xMax > blk1->ExMax &&
3691 blk2->xMax <= xMax &&
3692 blk2->yMin >= blk1->yMax) {
3693 blk1->ExMax = blk2->xMax;
3696 if (blk2->xMin < blk1->ExMin &&
3697 blk2->xMin >= xMin &&
3698 blk2->yMin >= blk1->yMax)
3699 blk1->ExMin = blk2->xMin;
3704 i = -1;
3705 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3706 i++;
3707 sortPos = blk1->visitDepthFirst(blkList, i, blocks, sortPos, visited);
3709 if (visited) {
3710 gfree(visited);
3713 #if 0 // for debugging
3714 printf("*** blocks, after ro sort ***\n");
3715 for (i = 0; i < nBlocks; ++i) {
3716 blk = blocks[i];
3717 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3718 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3719 blk->priMin, blk->priMax);
3720 for (line = blk->lines; line; line = line->next) {
3721 printf(" line:\n");
3722 for (word0 = line->words; word0; word0 = word0->next) {
3723 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3724 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3725 word0->base, word0->fontSize, word0->spaceAfter);
3726 for (j = 0; j < word0->len; ++j) {
3727 fputc(word0->text[j] & 0xff, stdout);
3729 printf("'\n");
3733 printf("\n");
3734 fflush(stdout);
3735 #endif
3737 // build the flows
3738 //~ this needs to be adjusted for writing mode (vertical text)
3739 //~ this also needs to account for right-to-left column ordering
3740 flow = NULL;
3741 while (flows) {
3742 flow = flows;
3743 flows = flows->next;
3744 delete flow;
3746 flows = lastFlow = NULL;
3747 // assume blocks are already in reading order,
3748 // and construct flows accordingly.
3749 for (i = 0; i < nBlocks; i++) {
3750 blk = blocks[i];
3751 blk->next = NULL;
3752 if (flow) {
3753 blk1 = blocks[i - 1];
3754 blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3755 if (blk1->secondaryDelta(blk) <= blkSpace &&
3756 blk->isBelow(blk1) &&
3757 flow->blockFits(blk, blk1)) {
3758 flow->addBlock(blk);
3759 continue;
3762 flow = new TextFlow(this, blk);
3763 if (lastFlow) {
3764 lastFlow->next = flow;
3765 } else {
3766 flows = flow;
3768 lastFlow = flow;
3771 #if 0 // for debugging
3772 printf("*** flows ***\n");
3773 for (flow = flows; flow; flow = flow->next) {
3774 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3775 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3776 flow->priMin, flow->priMax);
3777 for (blk = flow->blocks; blk; blk = blk->next) {
3778 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3779 blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3780 blk->priMin, blk->priMax);
3781 for (line = blk->lines; line; line = line->next) {
3782 printf(" line:\n");
3783 for (word0 = line->words; word0; word0 = word0->next) {
3784 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3785 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3786 word0->base, word0->fontSize, word0->spaceAfter);
3787 for (i = 0; i < word0->len; ++i) {
3788 fputc(word0->text[i] & 0xff, stdout);
3790 printf("'\n");
3795 printf("\n");
3796 #endif
3798 if (uMap) {
3799 uMap->decRefCnt();
3803 GBool TextPage::findText(Unicode *s, int len,
3804 GBool startAtTop, GBool stopAtBottom,
3805 GBool startAtLast, GBool stopAtLast,
3806 GBool caseSensitive, GBool backward,
3807 GBool wholeWord,
3808 double *xMin, double *yMin,
3809 double *xMax, double *yMax) {
3810 TextBlock *blk;
3811 TextLine *line;
3812 Unicode *s2, *txt, *reordered;
3813 Unicode *p;
3814 int txtSize, m, i, j, k;
3815 double xStart, yStart, xStop, yStop;
3816 double xMin0, yMin0, xMax0, yMax0;
3817 double xMin1, yMin1, xMax1, yMax1;
3818 GBool found;
3821 if (rawOrder) {
3822 return gFalse;
3825 // handle right-to-left text
3826 reordered = (Unicode*)gmallocn(len, sizeof(Unicode));
3827 reorderText(s, len, NULL, primaryLR, NULL, reordered);
3829 // normalize the search string
3830 s2 = unicodeNormalizeNFKC(reordered, len, &len, NULL);
3832 // convert the search string to uppercase
3833 if (!caseSensitive) {
3834 for (i = 0; i < len; ++i) {
3835 s2[i] = unicodeToUpper(s2[i]);
3839 txt = NULL;
3840 txtSize = 0;
3842 xStart = yStart = xStop = yStop = 0;
3843 if (startAtLast && haveLastFind) {
3844 xStart = lastFindXMin;
3845 yStart = lastFindYMin;
3846 } else if (!startAtTop) {
3847 xStart = *xMin;
3848 yStart = *yMin;
3850 if (stopAtLast && haveLastFind) {
3851 xStop = lastFindXMin;
3852 yStop = lastFindYMin;
3853 } else if (!stopAtBottom) {
3854 xStop = *xMax;
3855 yStop = *yMax;
3858 found = gFalse;
3859 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3860 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3862 for (i = backward ? nBlocks - 1 : 0;
3863 backward ? i >= 0 : i < nBlocks;
3864 i += backward ? -1 : 1) {
3865 blk = blocks[i];
3867 // check: is the block above the top limit?
3868 // (this only works if the page's primary rotation is zero --
3869 // otherwise the blocks won't be sorted in the useful order)
3870 if (!startAtTop && primaryRot == 0 &&
3871 (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
3872 continue;
3875 // check: is the block below the bottom limit?
3876 // (this only works if the page's primary rotation is zero --
3877 // otherwise the blocks won't be sorted in the useful order)
3878 if (!stopAtBottom && primaryRot == 0 &&
3879 (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
3880 break;
3883 for (line = blk->lines; line; line = line->next) {
3885 // check: is the line above the top limit?
3886 // (this only works if the page's primary rotation is zero --
3887 // otherwise the lines won't be sorted in the useful order)
3888 if (!startAtTop && primaryRot == 0 &&
3889 (backward ? line->yMin > yStart : line->yMin < yStart)) {
3890 continue;
3893 // check: is the line below the bottom limit?
3894 // (this only works if the page's primary rotation is zero --
3895 // otherwise the lines won't be sorted in the useful order)
3896 if (!stopAtBottom && primaryRot == 0 &&
3897 (backward ? line->yMin < yStop : line->yMin > yStop)) {
3898 continue;
3901 if (!line->normalized)
3902 line->normalized = unicodeNormalizeNFKC(line->text, line->len,
3903 &line->normalized_len,
3904 &line->normalized_idx,
3905 true);
3906 // convert the line to uppercase
3907 m = line->normalized_len;
3908 if (!caseSensitive) {
3909 if (m > txtSize) {
3910 txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3911 txtSize = m;
3913 for (k = 0; k < m; ++k) {
3914 txt[k] = unicodeToUpper(line->normalized[k]);
3916 } else {
3917 txt = line->normalized;
3920 // search each position in this line
3921 j = backward ? m - len : 0;
3922 p = txt + j;
3923 while (backward ? j >= 0 : j <= m - len) {
3924 if (!wholeWord ||
3925 ((j == 0 || !unicodeTypeAlphaNum(txt[j - 1])) &&
3926 (j + len == m || !unicodeTypeAlphaNum(txt[j + len])))) {
3928 // compare the strings
3929 for (k = 0; k < len; ++k) {
3930 if (p[k] != s2[k]) {
3931 break;
3935 // found it
3936 if (k == len) {
3937 // where s2 matches a subsequence of a compatibility equivalence
3938 // decomposition, highlight the entire glyph, since we don't know
3939 // the internal layout of subglyph components
3940 int normStart = line->normalized_idx[j];
3941 int normAfterEnd = line->normalized_idx[j + len - 1] + 1;
3942 switch (line->rot) {
3943 case 0:
3944 xMin1 = line->edge[normStart];
3945 xMax1 = line->edge[normAfterEnd];
3946 yMin1 = line->yMin;
3947 yMax1 = line->yMax;
3948 break;
3949 case 1:
3950 xMin1 = line->xMin;
3951 xMax1 = line->xMax;
3952 yMin1 = line->edge[normStart];
3953 yMax1 = line->edge[normAfterEnd];
3954 break;
3955 case 2:
3956 xMin1 = line->edge[normAfterEnd];
3957 xMax1 = line->edge[normStart];
3958 yMin1 = line->yMin;
3959 yMax1 = line->yMax;
3960 break;
3961 case 3:
3962 xMin1 = line->xMin;
3963 xMax1 = line->xMax;
3964 yMin1 = line->edge[normAfterEnd];
3965 yMax1 = line->edge[normStart];
3966 break;
3968 if (backward) {
3969 if ((startAtTop ||
3970 yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
3971 (stopAtBottom ||
3972 yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3973 if (!found ||
3974 yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3975 xMin0 = xMin1;
3976 xMax0 = xMax1;
3977 yMin0 = yMin1;
3978 yMax0 = yMax1;
3979 found = gTrue;
3982 } else {
3983 if ((startAtTop ||
3984 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
3985 (stopAtBottom ||
3986 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
3987 if (!found ||
3988 yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
3989 xMin0 = xMin1;
3990 xMax0 = xMax1;
3991 yMin0 = yMin1;
3992 yMax0 = yMax1;
3993 found = gTrue;
3999 if (backward) {
4000 --j;
4001 --p;
4002 } else {
4003 ++j;
4004 ++p;
4010 gfree(s2);
4011 gfree(reordered);
4012 if (!caseSensitive) {
4013 gfree(txt);
4016 if (found) {
4017 *xMin = xMin0;
4018 *xMax = xMax0;
4019 *yMin = yMin0;
4020 *yMax = yMax0;
4021 lastFindXMin = xMin0;
4022 lastFindYMin = yMin0;
4023 haveLastFind = gTrue;
4024 return gTrue;
4027 return gFalse;
4030 GooString *TextPage::getText(double xMin, double yMin,
4031 double xMax, double yMax) {
4032 GooString *s;
4033 UnicodeMap *uMap;
4034 TextBlock *blk;
4035 TextLine *line;
4036 TextLineFrag *frags;
4037 int nFrags, fragsSize;
4038 TextLineFrag *frag;
4039 char space[8], eol[16];
4040 int spaceLen, eolLen;
4041 int lastRot;
4042 double x, y, delta;
4043 int col, idx0, idx1, i, j;
4044 GBool multiLine, oneRot;
4046 s = new GooString();
4048 // get the output encoding
4049 if (!(uMap = globalParams->getTextEncoding())) {
4050 return s;
4053 if (rawOrder) {
4054 TextWord* word;
4055 char mbc[16];
4056 int mbc_len;
4058 for (word = rawWords; word && word <= rawLastWord; word = word->next) {
4059 for (j = 0; j < word->getLength(); ++j) {
4060 double gXMin, gXMax, gYMin, gYMax;
4061 word->getCharBBox(j, &gXMin, &gYMin, &gXMax, &gYMax);
4062 if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax)
4064 mbc_len = uMap->mapUnicode( *(word->getChar(j)), mbc, sizeof(mbc) );
4065 s->append(mbc, mbc_len);
4069 return s;
4072 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4073 eolLen = 0; // make gcc happy
4074 switch (globalParams->getTextEOL()) {
4075 case eolUnix:
4076 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4077 break;
4078 case eolDOS:
4079 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4080 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
4081 break;
4082 case eolMac:
4083 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4084 break;
4087 //~ writing mode (horiz/vert)
4089 // collect the line fragments that are in the rectangle
4090 fragsSize = 256;
4091 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
4092 nFrags = 0;
4093 lastRot = -1;
4094 oneRot = gTrue;
4095 for (i = 0; i < nBlocks; ++i) {
4096 blk = blocks[i];
4097 if (xMin < blk->xMax && blk->xMin < xMax &&
4098 yMin < blk->yMax && blk->yMin < yMax) {
4099 for (line = blk->lines; line; line = line->next) {
4100 if (xMin < line->xMax && line->xMin < xMax &&
4101 yMin < line->yMax && line->yMin < yMax) {
4102 idx0 = idx1 = -1;
4103 switch (line->rot) {
4104 case 0:
4105 y = 0.5 * (line->yMin + line->yMax);
4106 if (yMin < y && y < yMax) {
4107 j = 0;
4108 while (j < line->len) {
4109 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
4110 idx0 = j;
4111 break;
4113 ++j;
4115 j = line->len - 1;
4116 while (j >= 0) {
4117 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
4118 idx1 = j;
4119 break;
4121 --j;
4124 break;
4125 case 1:
4126 x = 0.5 * (line->xMin + line->xMax);
4127 if (xMin < x && x < xMax) {
4128 j = 0;
4129 while (j < line->len) {
4130 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
4131 idx0 = j;
4132 break;
4134 ++j;
4136 j = line->len - 1;
4137 while (j >= 0) {
4138 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
4139 idx1 = j;
4140 break;
4142 --j;
4145 break;
4146 case 2:
4147 y = 0.5 * (line->yMin + line->yMax);
4148 if (yMin < y && y < yMax) {
4149 j = 0;
4150 while (j < line->len) {
4151 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
4152 idx0 = j;
4153 break;
4155 ++j;
4157 j = line->len - 1;
4158 while (j >= 0) {
4159 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
4160 idx1 = j;
4161 break;
4163 --j;
4166 break;
4167 case 3:
4168 x = 0.5 * (line->xMin + line->xMax);
4169 if (xMin < x && x < xMax) {
4170 j = 0;
4171 while (j < line->len) {
4172 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
4173 idx0 = j;
4174 break;
4176 ++j;
4178 j = line->len - 1;
4179 while (j >= 0) {
4180 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
4181 idx1 = j;
4182 break;
4184 --j;
4187 break;
4189 if (idx0 >= 0 && idx1 >= 0) {
4190 if (nFrags == fragsSize) {
4191 fragsSize *= 2;
4192 frags = (TextLineFrag *)
4193 greallocn(frags, fragsSize, sizeof(TextLineFrag));
4195 frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
4196 ++nFrags;
4197 if (lastRot >= 0 && line->rot != lastRot) {
4198 oneRot = gFalse;
4200 lastRot = line->rot;
4207 // sort the fragments and generate the string
4208 if (nFrags > 0) {
4210 for (i = 0; i < nFrags; ++i) {
4211 frags[i].computeCoords(oneRot);
4213 assignColumns(frags, nFrags, oneRot);
4215 // if all lines in the region have the same rotation, use it;
4216 // otherwise, use the page's primary rotation
4217 if (oneRot) {
4218 qsort(frags, nFrags, sizeof(TextLineFrag),
4219 &TextLineFrag::cmpYXLineRot);
4220 } else {
4221 qsort(frags, nFrags, sizeof(TextLineFrag),
4222 &TextLineFrag::cmpYXPrimaryRot);
4224 i = 0;
4225 while (i < nFrags) {
4226 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
4227 for (j = i+1;
4228 j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
4229 ++j) ;
4230 qsort(frags + i, j - i, sizeof(TextLineFrag),
4231 oneRot ? &TextLineFrag::cmpXYColumnLineRot
4232 : &TextLineFrag::cmpXYColumnPrimaryRot);
4233 i = j;
4236 col = 0;
4237 multiLine = gFalse;
4238 for (i = 0; i < nFrags; ++i) {
4239 frag = &frags[i];
4241 // insert a return
4242 if (frag->col < col ||
4243 (i > 0 && fabs(frag->base - frags[i-1].base) >
4244 maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
4245 s->append(eol, eolLen);
4246 col = 0;
4247 multiLine = gTrue;
4250 // column alignment
4251 for (; col < frag->col; ++col) {
4252 s->append(space, spaceLen);
4255 // get the fragment text
4256 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
4259 if (multiLine) {
4260 s->append(eol, eolLen);
4264 gfree(frags);
4265 uMap->decRefCnt();
4267 return s;
4270 class TextSelectionVisitor {
4271 public:
4272 TextSelectionVisitor (TextPage *page);
4273 virtual ~TextSelectionVisitor () { }
4274 virtual void visitBlock (TextBlock *block,
4275 TextLine *begin,
4276 TextLine *end,
4277 PDFRectangle *selection) = 0;
4278 virtual void visitLine (TextLine *line,
4279 TextWord *begin,
4280 TextWord *end,
4281 int edge_begin,
4282 int edge_end,
4283 PDFRectangle *selection) = 0;
4284 virtual void visitWord (TextWord *word, int begin, int end,
4285 PDFRectangle *selection) = 0;
4287 protected:
4288 TextPage *page;
4291 TextSelectionVisitor::TextSelectionVisitor (TextPage *page)
4292 : page(page)
4297 class TextSelectionDumper : public TextSelectionVisitor {
4298 public:
4299 TextSelectionDumper(TextPage *page);
4300 virtual ~TextSelectionDumper();
4302 virtual void visitBlock (TextBlock *block,
4303 TextLine *begin,
4304 TextLine *end,
4305 PDFRectangle *selection) { };
4306 virtual void visitLine (TextLine *line,
4307 TextWord *begin,
4308 TextWord *end,
4309 int edge_begin,
4310 int edge_end,
4311 PDFRectangle *selection);
4312 virtual void visitWord (TextWord *word, int begin, int end,
4313 PDFRectangle *selection);
4314 void endPage();
4316 GooString *getText(void);
4317 GooList **takeWordList(int *nLines);
4319 private:
4321 void startLine();
4322 void finishLine();
4324 GooList **lines;
4325 int nLines, linesSize;
4326 GooList *words;
4327 int tableId;
4328 TextBlock *currentBlock;
4331 TextSelectionDumper::TextSelectionDumper(TextPage *page)
4332 : TextSelectionVisitor(page)
4334 linesSize = 256;
4335 lines = (GooList **)gmallocn(linesSize, sizeof(GooList *));
4336 nLines = 0;
4338 tableId = -1;
4339 currentBlock = NULL;
4340 words = NULL;
4343 TextSelectionDumper::~TextSelectionDumper()
4345 for (int i = 0; i < nLines; i++)
4346 deleteGooList(lines[i], TextWordSelection);
4347 gfree(lines);
4350 void TextSelectionDumper::startLine()
4352 finishLine();
4353 words = new GooList();
4356 void TextSelectionDumper::finishLine()
4358 if (nLines == linesSize) {
4359 linesSize *= 2;
4360 lines = (GooList **)grealloc(lines, linesSize * sizeof(GooList *));
4363 if (words && words->getLength() > 0)
4364 lines[nLines++] = words;
4365 else if (words)
4366 delete words;
4367 words = NULL;
4370 void TextSelectionDumper::visitLine (TextLine *line,
4371 TextWord *begin,
4372 TextWord *end,
4373 int edge_begin,
4374 int edge_end,
4375 PDFRectangle *selection)
4377 TextLineFrag frag;
4379 frag.init(line, edge_begin, edge_end - edge_begin);
4381 if (tableId >= 0 && frag.line->blk->tableId < 0) {
4382 finishLine();
4384 tableId = -1;
4385 currentBlock = NULL;
4388 if (frag.line->blk->tableId >= 0) { // a table
4389 if (tableId == -1) {
4390 tableId = frag.line->blk->tableId;
4391 currentBlock = frag.line->blk;
4394 if (currentBlock == frag.line->blk) { // the same block
4395 startLine();
4396 } else { // another block
4397 if (currentBlock->tableEnd) { // previous block ended its row
4398 startLine();
4400 currentBlock = frag.line->blk;
4402 } else { // not a table
4403 startLine();
4407 void TextSelectionDumper::visitWord (TextWord *word, int begin, int end,
4408 PDFRectangle *selection)
4410 words->append(new TextWordSelection(word, begin, end));
4413 void TextSelectionDumper::endPage()
4415 finishLine();
4418 GooString *TextSelectionDumper::getText (void)
4420 GooString *text;
4421 int i, j;
4422 UnicodeMap *uMap;
4423 char space[8], eol[16];
4424 int spaceLen, eolLen;
4426 text = new GooString();
4428 if (!(uMap = globalParams->getTextEncoding()))
4429 return text;
4431 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4432 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4434 for (i = 0; i < nLines; i++) {
4435 GooList *lineWords = lines[i];
4436 for (j = 0; j < lineWords->getLength(); j++) {
4437 TextWordSelection *sel = (TextWordSelection *)lineWords->get(j);
4439 page->dumpFragment (sel->word->text + sel->begin, sel->end - sel->begin, uMap, text);
4440 if (j < lineWords->getLength() - 1)
4441 text->append(space, spaceLen);
4443 if (i < nLines - 1)
4444 text->append(eol, eolLen);
4447 uMap->decRefCnt();
4449 return text;
4452 GooList **TextSelectionDumper::takeWordList(int *nLinesOut)
4454 GooList **returnValue = lines;
4456 *nLinesOut = nLines;
4457 if (nLines == 0)
4458 return NULL;
4460 nLines = 0;
4461 lines = NULL;
4463 return returnValue;
4466 class TextSelectionSizer : public TextSelectionVisitor {
4467 public:
4468 TextSelectionSizer(TextPage *page, double scale);
4469 ~TextSelectionSizer() { }
4471 virtual void visitBlock (TextBlock *block,
4472 TextLine *begin,
4473 TextLine *end,
4474 PDFRectangle *selection) { };
4475 virtual void visitLine (TextLine *line,
4476 TextWord *begin,
4477 TextWord *end,
4478 int edge_begin,
4479 int edge_end,
4480 PDFRectangle *selection);
4481 virtual void visitWord (TextWord *word, int begin, int end,
4482 PDFRectangle *selection) { };
4484 GooList *getRegion () { return list; }
4486 private:
4487 GooList *list;
4488 double scale;
4491 TextSelectionSizer::TextSelectionSizer(TextPage *page, double scale)
4492 : TextSelectionVisitor(page),
4493 scale(scale)
4495 list = new GooList();
4498 void TextSelectionSizer::visitLine (TextLine *line,
4499 TextWord *begin,
4500 TextWord *end,
4501 int edge_begin,
4502 int edge_end,
4503 PDFRectangle *selection)
4505 PDFRectangle *rect;
4506 double x1, y1, x2, y2, margin;
4508 margin = (line->yMax - line->yMin) / 8;
4509 x1 = line->edge[edge_begin];
4510 y1 = line->yMin - margin;
4511 x2 = line->edge[edge_end];
4512 y2 = line->yMax + margin;
4514 rect = new PDFRectangle (floor (x1 * scale),
4515 floor (y1 * scale),
4516 ceil (x2 * scale),
4517 ceil (y2 * scale));
4518 list->append (rect);
4522 class TextSelectionPainter : public TextSelectionVisitor {
4523 public:
4524 TextSelectionPainter(TextPage *page,
4525 double scale,
4526 int rotation,
4527 OutputDev *out,
4528 GfxColor *box_color,
4529 GfxColor *glyph_color);
4530 ~TextSelectionPainter();
4532 virtual void visitBlock (TextBlock *block,
4533 TextLine *begin,
4534 TextLine *end,
4535 PDFRectangle *selection) { };
4536 virtual void visitLine (TextLine *line,
4537 TextWord *begin,
4538 TextWord *end,
4539 int edge_begin,
4540 int edge_end,
4541 PDFRectangle *selection);
4542 virtual void visitWord (TextWord *word, int begin, int end,
4543 PDFRectangle *selection);
4544 void endPage();
4546 private:
4547 OutputDev *out;
4548 GfxColor *box_color, *glyph_color;
4549 GfxState *state;
4550 GooList *selectionList;
4551 Matrix ctm, ictm;
4554 TextSelectionPainter::TextSelectionPainter(TextPage *page,
4555 double scale,
4556 int rotation,
4557 OutputDev *out,
4558 GfxColor *box_color,
4559 GfxColor *glyph_color)
4560 : TextSelectionVisitor(page),
4561 out(out),
4562 box_color(box_color),
4563 glyph_color(glyph_color)
4565 PDFRectangle box(0, 0, page->pageWidth, page->pageHeight);
4567 selectionList = new GooList();
4568 state = new GfxState(72 * scale, 72 * scale, &box, rotation, gFalse);
4570 state->getCTM(&ctm);
4571 ctm.invertTo(&ictm);
4573 out->startPage(0, state, NULL);
4574 out->setDefaultCTM (state->getCTM());
4576 state->setFillColorSpace(new GfxDeviceRGBColorSpace());
4577 state->setFillColor(box_color);
4578 out->updateFillColor(state);
4581 TextSelectionPainter::~TextSelectionPainter()
4583 deleteGooList(selectionList, TextWordSelection);
4584 delete state;
4587 void TextSelectionPainter::visitLine (TextLine *line,
4588 TextWord *begin,
4589 TextWord *end,
4590 int edge_begin,
4591 int edge_end,
4592 PDFRectangle *selection)
4594 double x1, y1, x2, y2, margin;
4596 margin = (line->yMax - line->yMin) / 8;
4597 x1 = floor (line->edge[edge_begin]);
4598 y1 = floor (line->yMin - margin);
4599 x2 = ceil (line->edge[edge_end]);
4600 y2 = ceil (line->yMax + margin);
4602 ctm.transform(line->edge[edge_begin], line->yMin - margin, &x1, &y1);
4603 ctm.transform(line->edge[edge_end], line->yMax + margin, &x2, &y2);
4605 x1 = floor (x1);
4606 y1 = floor (y1);
4607 x2 = ceil (x2);
4608 y2 = ceil (y2);
4610 ictm.transform(x1, y1, &x1, &y1);
4611 ictm.transform(x2, y2, &x2, &y2);
4613 state->moveTo(x1, y1);
4614 state->lineTo(x2, y1);
4615 state->lineTo(x2, y2);
4616 state->lineTo(x1, y2);
4617 state->closePath();
4620 void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
4621 PDFRectangle *selection)
4623 selectionList->append(new TextWordSelection(word, begin, end));
4626 void TextSelectionPainter::endPage()
4628 out->fill(state);
4630 out->saveState(state);
4631 out->clip(state);
4633 state->clearPath();
4635 state->setFillColor(glyph_color);
4636 out->updateFillColor(state);
4638 for (int i = 0; i < selectionList->getLength(); i++) {
4639 TextWordSelection *sel = (TextWordSelection *) selectionList->get(i);
4640 int begin = sel->begin;
4642 while (begin < sel->end) {
4643 TextFontInfo *font = sel->word->font[begin];
4644 font->gfxFont->incRefCnt();
4645 Matrix *mat = &sel->word->textMat[begin];
4647 state->setTextMat(mat->m[0], mat->m[1], mat->m[2], mat->m[3], 0, 0);
4648 state->setFont(font->gfxFont, 1);
4649 out->updateFont(state);
4651 int fEnd = begin + 1;
4652 while (fEnd < sel->end && font->matches(sel->word->font[fEnd]) &&
4653 mat->m[0] == sel->word->textMat[fEnd].m[0] &&
4654 mat->m[1] == sel->word->textMat[fEnd].m[1] &&
4655 mat->m[2] == sel->word->textMat[fEnd].m[2] &&
4656 mat->m[3] == sel->word->textMat[fEnd].m[3])
4657 fEnd++;
4659 /* The only purpose of this string is to let the output device query
4660 * it's length. Might want to change this interface later. */
4661 GooString *string = new GooString ((char *) sel->word->charcode, fEnd - begin);
4662 out->beginString(state, string);
4664 for (int i = begin; i < fEnd; i++) {
4665 if (i != begin && sel->word->charPos[i] == sel->word->charPos[i - 1])
4666 continue;
4668 out->drawChar(state, sel->word->textMat[i].m[4], sel->word->textMat[i].m[5], 0, 0, 0, 0,
4669 sel->word->charcode[i], 1, NULL, 0);
4671 out->endString(state);
4672 delete string;
4673 begin = fEnd;
4677 out->restoreState(state);
4678 out->endPage ();
4681 void TextWord::visitSelection(TextSelectionVisitor *visitor,
4682 PDFRectangle *selection,
4683 SelectionStyle style)
4685 int i, begin, end;
4686 double mid;
4688 begin = len;
4689 end = 0;
4690 for (i = 0; i < len; i++) {
4691 mid = (edge[i] + edge[i + 1]) / 2;
4692 if (selection->x1 < mid || selection->x2 < mid)
4693 if (i < begin)
4694 begin = i;
4695 if (mid < selection->x1 || mid < selection->x2)
4696 end = i + 1;
4699 /* Skip empty selection. */
4700 if (end <= begin)
4701 return;
4703 visitor->visitWord (this, begin, end, selection);
4706 void TextLine::visitSelection(TextSelectionVisitor *visitor,
4707 PDFRectangle *selection,
4708 SelectionStyle style) {
4709 TextWord *p, *begin, *end, *current;
4710 int i, edge_begin, edge_end;
4711 PDFRectangle child_selection;
4713 begin = NULL;
4714 end = NULL;
4715 current = NULL;
4716 for (p = words; p != NULL; p = p->next) {
4717 if (blk->page->primaryLR) {
4718 if ((selection->x1 < p->xMax) ||
4719 (selection->x2 < p->xMax))
4720 if (begin == NULL)
4721 begin = p;
4723 if (((selection->x1 > p->xMin) ||
4724 (selection->x2 > p->xMin)) && (begin != NULL)) {
4725 end = p->next;
4726 current = p;
4728 } else {
4729 if ((selection->x1 > p->xMin) ||
4730 (selection->x2 > p->xMin))
4731 if (begin == NULL)
4732 begin = p;
4734 if (((selection->x1 < p->xMax) ||
4735 (selection->x2 < p->xMax)) && (begin != NULL)) {
4736 end = p->next;
4737 current = p;
4742 if (!current)
4743 current = begin;
4745 child_selection = *selection;
4746 if (style == selectionStyleWord) {
4747 child_selection.x1 = begin ? begin->xMin : xMin;
4748 if (end && end->xMax != -1) {
4749 child_selection.x2 = current->xMax;
4750 } else {
4751 child_selection.x2 = xMax;
4755 edge_begin = len;
4756 edge_end = 0;
4757 for (i = 0; i < len; i++) {
4758 double mid = (edge[i] + edge[i + 1]) / 2;
4759 if (child_selection.x1 < mid || child_selection.x2 < mid)
4760 if (i < edge_begin)
4761 edge_begin = i;
4762 if (mid < child_selection.x2 || mid < child_selection.x1)
4763 edge_end = i + 1;
4766 /* Skip empty selection. */
4767 if (edge_end <= edge_begin)
4768 return;
4770 visitor->visitLine (this, begin, end, edge_begin, edge_end,
4771 &child_selection);
4773 for (p = begin; p != end; p = p->next)
4774 p->visitSelection (visitor, &child_selection, style);
4777 void TextBlock::visitSelection(TextSelectionVisitor *visitor,
4778 PDFRectangle *selection,
4779 SelectionStyle style) {
4780 PDFRectangle child_selection;
4781 double x[2], y[2], d, best_d[2];
4782 TextLine *p, *best_line[2];
4783 int i, count = 0, best_count[2], start, stop;
4784 GBool all[2];
4786 x[0] = selection->x1;
4787 y[0] = selection->y1;
4788 x[1] = selection->x2;
4789 y[1] = selection->y2;
4791 for (i = 0; i < 2; i++) {
4792 // the first/last lines are often not nearest
4793 // the corners, so we have to force them to be
4794 // selected when the selection runs outside this
4795 // block.
4796 if (page->primaryLR) {
4797 all[i] = x[i] >= this->xMax && y[i] >= this->yMax;
4798 if (x[i] <= this->xMin && y[i] <= this->yMin) {
4799 best_line[i] = this->lines;
4800 best_count[i] = 1;
4801 } else {
4802 best_line[i] = NULL;
4803 best_count[i] = 0;
4805 } else {
4806 all[i] = x[i] <= this->xMin && y[i] >= this->yMax;
4807 if (x[i] >= this->xMax && y[i] <= this->yMin) {
4808 best_line[i] = this->lines;
4809 best_count[i] = 1;
4810 } else {
4811 best_line[i] = NULL;
4812 best_count[i] = 0;
4815 best_d[i] = 0;
4818 // find the nearest line to the selection points
4819 // using the manhattan distance.
4820 for (p = this->lines; p; p = p->next) {
4821 count++;
4822 for (i = 0; i < 2; i++) {
4823 d = fmax(p->xMin - x[i], 0.0) +
4824 fmax(x[i] - p->xMax, 0.0) +
4825 fmax(p->yMin - y[i], 0.0) +
4826 fmax(y[i] - p->yMax, 0.0);
4827 if (!best_line[i] || all[i] ||
4828 d < best_d[i]) {
4829 best_line[i] = p;
4830 best_count[i] = count;
4831 best_d[i] = d;
4835 // assert: best is always set.
4836 if (!best_line[0] || !best_line[1]) {
4837 return;
4840 // Now decide which point was first.
4841 if (best_count[0] < best_count[1] ||
4842 (best_count[0] == best_count[1] &&
4843 y[0] < y[1])) {
4844 start = 0;
4845 stop = 1;
4846 } else {
4847 start = 1;
4848 stop = 0;
4851 visitor->visitBlock(this, best_line[start], best_line[stop], selection);
4853 for (p = best_line[start]; p; p = p->next) {
4854 if (page->primaryLR) {
4855 child_selection.x1 = p->xMin;
4856 child_selection.x2 = p->xMax;
4857 } else {
4858 child_selection.x1 = p->xMax;
4859 child_selection.x2 = p->xMin;
4861 child_selection.y1 = p->yMin;
4862 child_selection.y2 = p->yMax;
4863 if (style == selectionStyleLine) {
4864 if (p == best_line[start]) {
4865 child_selection.x1 = 0;
4866 child_selection.y1 = 0;
4868 if (p == best_line[stop]) {
4869 child_selection.x2 = page->pageWidth;
4870 child_selection.y2 = page->pageHeight;
4872 } else {
4873 if (p == best_line[start]) {
4874 child_selection.x1 = fmax(p->xMin, fmin(p->xMax, x[start]));
4875 child_selection.y1 = fmax(p->yMin, fmin(p->yMax, y[start]));
4877 if (p == best_line[stop]) {
4878 child_selection.x2 = fmax(p->xMin, fmin(p->xMax, x[stop]));
4879 child_selection.y2 = fmax(p->yMin, fmin(p->yMax, y[stop]));
4882 p->visitSelection(visitor, &child_selection, style);
4883 if (p == best_line[stop]) {
4884 return;
4889 void TextPage::visitSelection(TextSelectionVisitor *visitor,
4890 PDFRectangle *selection,
4891 SelectionStyle style)
4893 PDFRectangle child_selection;
4894 double x[2], y[2], d, best_d[2];
4895 double xMin, yMin, xMax, yMax;
4896 TextFlow *flow, *best_flow[2];
4897 TextBlock *blk, *best_block[2];
4898 int i, count = 0, best_count[2], start, stop;
4900 if (!flows)
4901 return;
4903 x[0] = selection->x1;
4904 y[0] = selection->y1;
4905 x[1] = selection->x2;
4906 y[1] = selection->y2;
4908 xMin = pageWidth;
4909 yMin = pageHeight;
4910 xMax = 0.0;
4911 yMax = 0.0;
4913 for (i = 0; i < 2; i++) {
4914 best_block[i] = NULL;
4915 best_flow[i] = NULL;
4916 best_count[i] = 0;
4917 best_d[i] = 0;
4920 // find the nearest blocks to the selection points
4921 // using the manhattan distance.
4922 for (flow = flows; flow; flow = flow->next) {
4923 for (blk = flow->blocks; blk; blk = blk->next) {
4924 count++;
4925 // the first/last blocks in reading order are
4926 // often not the closest to the page corners;
4927 // track the corners, force those blocks to
4928 // be selected if the selection runs across
4929 // multiple pages.
4930 xMin = fmin(xMin, blk->xMin);
4931 yMin = fmin(yMin, blk->yMin);
4932 xMax = fmax(xMax, blk->xMax);
4933 yMax = fmax(yMax, blk->yMax);
4934 for (i = 0; i < 2; i++) {
4935 d = fmax(blk->xMin - x[i], 0.0) +
4936 fmax(x[i] - blk->xMax, 0.0) +
4937 fmax(blk->yMin - y[i], 0.0) +
4938 fmax(y[i] - blk->yMax, 0.0);
4939 if (!best_block[i] ||
4940 d < best_d[i] ||
4941 (!blk->next && !flow->next &&
4942 x[i] >= fmin(xMax, pageWidth) &&
4943 y[i] >= fmin(yMax, pageHeight))) {
4944 best_block[i] = blk;
4945 best_flow[i] = flow;
4946 best_count[i] = count;
4947 best_d[i] = d;
4952 for (i = 0; i < 2; i++) {
4953 if (primaryLR) {
4954 if (x[i] < xMin && y[i] < yMin) {
4955 best_block[i] = flows->blocks;
4956 best_flow[i] = flows;
4957 best_count[i] = 1;
4959 } else {
4960 if (x[i] > xMax && y[i] < yMin) {
4961 best_block[i] = flows->blocks;
4962 best_flow[i] = flows;
4963 best_count[i] = 1;
4967 // assert: best is always set.
4968 if (!best_block[0] || !best_block[1]) {
4969 return;
4972 // Now decide which point was first.
4973 if (best_count[0] < best_count[1] ||
4974 (best_count[0] == best_count[1] && y[0] < y[1])) {
4975 start = 0;
4976 stop = 1;
4977 } else {
4978 start = 1;
4979 stop = 0;
4982 for (flow = best_flow[start]; flow; flow = flow->next) {
4983 if (flow == best_flow[start]) {
4984 blk = best_block[start];
4985 } else {
4986 blk = flow->blocks;
4988 for (; blk; blk = blk->next) {
4989 if (primaryLR) {
4990 child_selection.x1 = blk->xMin;
4991 child_selection.x2 = blk->xMax;
4992 } else {
4993 child_selection.x1 = blk->xMax;
4994 child_selection.x2 = blk->xMin;
4996 child_selection.y1 = blk->yMin;
4997 child_selection.y2 = blk->yMax;
4998 if (blk == best_block[start]) {
4999 child_selection.x1 = fmax(blk->xMin, fmin(blk->xMax, x[start]));
5000 child_selection.y1 = fmax(blk->yMin, fmin(blk->yMax, y[start]));
5002 if (blk == best_block[stop]) {
5003 child_selection.x2 = fmax(blk->xMin, fmin(blk->xMax, x[stop]));
5004 child_selection.y2 = fmax(blk->yMin, fmin(blk->yMax, y[stop]));
5005 blk->visitSelection(visitor, &child_selection, style);
5006 return;
5008 blk->visitSelection(visitor, &child_selection, style);
5013 void TextPage::drawSelection(OutputDev *out,
5014 double scale,
5015 int rotation,
5016 PDFRectangle *selection,
5017 SelectionStyle style,
5018 GfxColor *glyph_color, GfxColor *box_color)
5020 TextSelectionPainter painter(this, scale, rotation,
5021 out, box_color, glyph_color);
5023 visitSelection(&painter, selection, style);
5024 painter.endPage();
5027 GooList *TextPage::getSelectionRegion(PDFRectangle *selection,
5028 SelectionStyle style,
5029 double scale) {
5030 TextSelectionSizer sizer(this, scale);
5032 visitSelection(&sizer, selection, style);
5034 return sizer.getRegion();
5037 GooString *TextPage::getSelectionText(PDFRectangle *selection,
5038 SelectionStyle style)
5040 TextSelectionDumper dumper(this);
5042 visitSelection(&dumper, selection, style);
5043 dumper.endPage();
5045 return dumper.getText();
5048 GooList **TextPage::getSelectionWords(PDFRectangle *selection,
5049 SelectionStyle style,
5050 int *nLines)
5052 TextSelectionDumper dumper(this);
5054 visitSelection(&dumper, selection, style);
5055 dumper.endPage();
5057 return dumper.takeWordList(nLines);
5060 GBool TextPage::findCharRange(int pos, int length,
5061 double *xMin, double *yMin,
5062 double *xMax, double *yMax) {
5063 TextBlock *blk;
5064 TextLine *line;
5065 TextWord *word;
5066 double xMin0, xMax0, yMin0, yMax0;
5067 double xMin1, xMax1, yMin1, yMax1;
5068 GBool first;
5069 int i, j0, j1;
5071 if (rawOrder) {
5072 return gFalse;
5075 //~ this doesn't correctly handle ranges split across multiple lines
5076 //~ (the highlighted region is the bounding box of all the parts of
5077 //~ the range)
5078 first = gTrue;
5079 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
5080 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
5081 for (i = 0; i < nBlocks; ++i) {
5082 blk = blocks[i];
5083 for (line = blk->lines; line; line = line->next) {
5084 for (word = line->words; word; word = word->next) {
5085 if (pos < word->charPos[word->len] &&
5086 pos + length > word->charPos[0]) {
5087 for (j0 = 0;
5088 j0 < word->len && pos >= word->charPos[j0 + 1];
5089 ++j0) ;
5090 for (j1 = word->len - 1;
5091 j1 > j0 && pos + length <= word->charPos[j1];
5092 --j1) ;
5093 switch (line->rot) {
5094 case 0:
5095 xMin1 = word->edge[j0];
5096 xMax1 = word->edge[j1 + 1];
5097 yMin1 = word->yMin;
5098 yMax1 = word->yMax;
5099 break;
5100 case 1:
5101 xMin1 = word->xMin;
5102 xMax1 = word->xMax;
5103 yMin1 = word->edge[j0];
5104 yMax1 = word->edge[j1 + 1];
5105 break;
5106 case 2:
5107 xMin1 = word->edge[j1 + 1];
5108 xMax1 = word->edge[j0];
5109 yMin1 = word->yMin;
5110 yMax1 = word->yMax;
5111 break;
5112 case 3:
5113 xMin1 = word->xMin;
5114 xMax1 = word->xMax;
5115 yMin1 = word->edge[j1 + 1];
5116 yMax1 = word->edge[j0];
5117 break;
5119 if (first || xMin1 < xMin0) {
5120 xMin0 = xMin1;
5122 if (first || xMax1 > xMax0) {
5123 xMax0 = xMax1;
5125 if (first || yMin1 < yMin0) {
5126 yMin0 = yMin1;
5128 if (first || yMax1 > yMax0) {
5129 yMax0 = yMax1;
5131 first = gFalse;
5136 if (!first) {
5137 *xMin = xMin0;
5138 *xMax = xMax0;
5139 *yMin = yMin0;
5140 *yMax = yMax0;
5141 return gTrue;
5143 return gFalse;
5146 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
5147 GBool physLayout) {
5148 UnicodeMap *uMap;
5149 TextFlow *flow;
5150 TextBlock *blk;
5151 TextLine *line;
5152 TextLineFrag *frags;
5153 TextWord *word;
5154 int nFrags, fragsSize;
5155 TextLineFrag *frag;
5156 char space[8], eol[16], eop[8];
5157 int spaceLen, eolLen, eopLen;
5158 GBool pageBreaks;
5159 GooString *s;
5160 double delta;
5161 int col, i, j, d, n;
5163 // get the output encoding
5164 if (!(uMap = globalParams->getTextEncoding())) {
5165 return;
5167 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
5168 eolLen = 0; // make gcc happy
5169 switch (globalParams->getTextEOL()) {
5170 case eolUnix:
5171 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
5172 break;
5173 case eolDOS:
5174 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
5175 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
5176 break;
5177 case eolMac:
5178 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
5179 break;
5181 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
5182 pageBreaks = globalParams->getTextPageBreaks();
5184 //~ writing mode (horiz/vert)
5186 // output the page in raw (content stream) order
5187 if (rawOrder) {
5189 for (word = rawWords; word; word = word->next) {
5190 s = new GooString();
5191 dumpFragment(word->text, word->len, uMap, s);
5192 (*outputFunc)(outputStream, s->getCString(), s->getLength());
5193 delete s;
5194 if (word->next &&
5195 fabs(word->next->base - word->base) <
5196 maxIntraLineDelta * word->fontSize &&
5197 word->next->xMin >
5198 word->xMax - minDupBreakOverlap * word->fontSize) {
5199 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
5200 (*outputFunc)(outputStream, space, spaceLen);
5202 } else {
5203 (*outputFunc)(outputStream, eol, eolLen);
5207 // output the page, maintaining the original physical layout
5208 } else if (physLayout) {
5210 // collect the line fragments for the page and sort them
5211 fragsSize = 256;
5212 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
5213 nFrags = 0;
5214 for (i = 0; i < nBlocks; ++i) {
5215 blk = blocks[i];
5216 for (line = blk->lines; line; line = line->next) {
5217 if (nFrags == fragsSize) {
5218 fragsSize *= 2;
5219 frags = (TextLineFrag *)greallocn(frags,
5220 fragsSize, sizeof(TextLineFrag));
5222 frags[nFrags].init(line, 0, line->len);
5223 frags[nFrags].computeCoords(gTrue);
5224 ++nFrags;
5227 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
5228 i = 0;
5229 while (i < nFrags) {
5230 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
5231 for (j = i+1;
5232 j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
5233 ++j) ;
5234 qsort(frags + i, j - i, sizeof(TextLineFrag),
5235 &TextLineFrag::cmpXYColumnPrimaryRot);
5236 i = j;
5239 #if 0 // for debugging
5240 printf("*** line fragments ***\n");
5241 for (i = 0; i < nFrags; ++i) {
5242 frag = &frags[i];
5243 printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
5244 frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
5245 for (n = 0; n < frag->len; ++n) {
5246 fputc(frag->line->text[frag->start + n] & 0xff, stdout);
5248 printf("'\n");
5250 printf("\n");
5251 #endif
5253 // generate output
5254 col = 0;
5255 for (i = 0; i < nFrags; ++i) {
5256 frag = &frags[i];
5258 // column alignment
5259 for (; col < frag->col; ++col) {
5260 (*outputFunc)(outputStream, space, spaceLen);
5263 // print the line
5264 s = new GooString();
5265 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
5266 (*outputFunc)(outputStream, s->getCString(), s->getLength());
5267 delete s;
5269 // print one or more returns if necessary
5270 if (i == nFrags - 1 ||
5271 frags[i+1].col < col ||
5272 fabs(frags[i+1].base - frag->base) >
5273 maxIntraLineDelta * frag->line->words->fontSize) {
5274 if (i < nFrags - 1) {
5275 d = (int)((frags[i+1].base - frag->base) /
5276 frag->line->words->fontSize);
5277 if (d < 1) {
5278 d = 1;
5279 } else if (d > 5) {
5280 d = 5;
5282 } else {
5283 d = 1;
5285 for (; d > 0; --d) {
5286 (*outputFunc)(outputStream, eol, eolLen);
5288 col = 0;
5292 gfree(frags);
5294 // output the page, "undoing" the layout
5295 } else {
5296 for (flow = flows; flow; flow = flow->next) {
5297 for (blk = flow->blocks; blk; blk = blk->next) {
5298 for (line = blk->lines; line; line = line->next) {
5299 n = line->len;
5300 if (line->hyphenated && (line->next || blk->next)) {
5301 --n;
5303 s = new GooString();
5304 dumpFragment(line->text, n, uMap, s);
5305 (*outputFunc)(outputStream, s->getCString(), s->getLength());
5306 delete s;
5307 // output a newline when a hyphen is not suppressed
5308 if (n == line->len) {
5309 (*outputFunc)(outputStream, eol, eolLen);
5313 (*outputFunc)(outputStream, eol, eolLen);
5317 // end of page
5318 if (pageBreaks) {
5319 (*outputFunc)(outputStream, eop, eopLen);
5322 uMap->decRefCnt();
5325 void TextPage::setMergeCombining(GBool merge) {
5326 mergeCombining = merge;
5329 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
5330 TextLineFrag *frag0, *frag1;
5331 int rot, col1, col2, i, j, k;
5333 // all text in the region has the same rotation -- recompute the
5334 // column numbers based only on the text in the region
5335 if (oneRot) {
5336 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
5337 rot = frags[0].line->rot;
5338 for (i = 0; i < nFrags; ++i) {
5339 frag0 = &frags[i];
5340 col1 = 0;
5341 for (j = 0; j < i; ++j) {
5342 frag1 = &frags[j];
5343 col2 = 0; // make gcc happy
5344 switch (rot) {
5345 case 0:
5346 if (frag0->xMin >= frag1->xMax) {
5347 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
5348 frag1->line->col[frag1->start]) + 1;
5349 } else {
5350 for (k = frag1->start;
5351 k < frag1->start + frag1->len &&
5352 frag0->xMin >= 0.5 * (frag1->line->edge[k] +
5353 frag1->line->edge[k+1]);
5354 ++k) ;
5355 col2 = frag1->col +
5356 frag1->line->col[k] - frag1->line->col[frag1->start];
5358 break;
5359 case 1:
5360 if (frag0->yMin >= frag1->yMax) {
5361 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
5362 frag1->line->col[frag1->start]) + 1;
5363 } else {
5364 for (k = frag1->start;
5365 k < frag1->start + frag1->len &&
5366 frag0->yMin >= 0.5 * (frag1->line->edge[k] +
5367 frag1->line->edge[k+1]);
5368 ++k) ;
5369 col2 = frag1->col +
5370 frag1->line->col[k] - frag1->line->col[frag1->start];
5372 break;
5373 case 2:
5374 if (frag0->xMax <= frag1->xMin) {
5375 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
5376 frag1->line->col[frag1->start]) + 1;
5377 } else {
5378 for (k = frag1->start;
5379 k < frag1->start + frag1->len &&
5380 frag0->xMax <= 0.5 * (frag1->line->edge[k] +
5381 frag1->line->edge[k+1]);
5382 ++k) ;
5383 col2 = frag1->col +
5384 frag1->line->col[k] - frag1->line->col[frag1->start];
5386 break;
5387 case 3:
5388 if (frag0->yMax <= frag1->yMin) {
5389 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
5390 frag1->line->col[frag1->start]) + 1;
5391 } else {
5392 for (k = frag1->start;
5393 k < frag1->start + frag1->len &&
5394 frag0->yMax <= 0.5 * (frag1->line->edge[k] +
5395 frag1->line->edge[k+1]);
5396 ++k) ;
5397 col2 = frag1->col +
5398 frag1->line->col[k] - frag1->line->col[frag1->start];
5400 break;
5402 if (col2 > col1) {
5403 col1 = col2;
5406 frag0->col = col1;
5409 // the region includes text at different rotations -- use the
5410 // globally assigned column numbers, offset by the minimum column
5411 // number (i.e., shift everything over to column 0)
5412 } else {
5413 col1 = frags[0].col;
5414 for (i = 1; i < nFrags; ++i) {
5415 if (frags[i].col < col1) {
5416 col1 = frags[i].col;
5419 for (i = 0; i < nFrags; ++i) {
5420 frags[i].col -= col1;
5425 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
5426 GooString *s) {
5427 if (uMap->isUnicode()) {
5428 return reorderText(text, len, uMap, primaryLR, s, NULL);
5429 } else {
5430 int nCols = 0;
5432 char buf[8];
5433 int buflen = 0;
5435 for (int i = 0; i < len; ++i) {
5436 buflen = uMap->mapUnicode(text[i], buf, sizeof(buf));
5437 s->append(buf, buflen);
5438 nCols += buflen;
5441 return nCols;
5445 #if TEXTOUT_WORD_LIST
5446 TextWordList *TextPage::makeWordList(GBool physLayout) {
5447 return new TextWordList(this, physLayout);
5449 #endif
5451 //------------------------------------------------------------------------
5452 // ActualText
5453 //------------------------------------------------------------------------
5454 ActualText::ActualText(TextPage *out) {
5455 out->incRefCnt();
5456 text = out;
5457 actualText = NULL;
5458 actualTextNBytes = 0;
5461 ActualText::~ActualText() {
5462 if (actualText)
5463 delete actualText;
5464 text->decRefCnt();
5467 void ActualText::addChar(GfxState *state, double x, double y,
5468 double dx, double dy,
5469 CharCode c, int nBytes, Unicode *u, int uLen) {
5470 if (!actualText) {
5471 text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5472 return;
5475 // Inside ActualText span.
5476 if (!actualTextNBytes) {
5477 actualTextX0 = x;
5478 actualTextY0 = y;
5480 actualTextX1 = x + dx;
5481 actualTextY1 = y + dy;
5482 actualTextNBytes += nBytes;
5485 void ActualText::begin(GfxState *state, GooString *text) {
5486 if (actualText)
5487 delete actualText;
5488 actualText = new GooString(text);
5489 actualTextNBytes = 0;
5492 void ActualText::end(GfxState *state) {
5493 // ActualText span closed. Output the span text and the
5494 // extents of all the glyphs inside the span
5496 if (actualTextNBytes) {
5497 Unicode *uni = NULL;
5498 int length;
5500 // now that we have the position info for all of the text inside
5501 // the marked content span, we feed the "ActualText" back through
5502 // text->addChar()
5503 length = TextStringToUCS4(actualText, &uni);
5504 text->addChar(state, actualTextX0, actualTextY0,
5505 actualTextX1 - actualTextX0, actualTextY1 - actualTextY0,
5506 0, actualTextNBytes, uni, length);
5507 gfree(uni);
5510 delete actualText;
5511 actualText = NULL;
5512 actualTextNBytes = 0;
5515 //------------------------------------------------------------------------
5516 // TextOutputDev
5517 //------------------------------------------------------------------------
5519 static void TextOutputDev_outputToFile(void *stream, const char *text, int len) {
5520 fwrite(text, 1, len, (FILE *)stream);
5523 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
5524 double fixedPitchA, GBool rawOrderA,
5525 GBool append) {
5526 text = NULL;
5527 physLayout = physLayoutA;
5528 fixedPitch = physLayout ? fixedPitchA : 0;
5529 rawOrder = rawOrderA;
5530 doHTML = gFalse;
5531 ok = gTrue;
5533 // open file
5534 needClose = gFalse;
5535 if (fileName) {
5536 if (!strcmp(fileName, "-")) {
5537 outputStream = stdout;
5538 #ifdef _WIN32
5539 // keep DOS from munging the end-of-line characters
5540 setmode(fileno(stdout), O_BINARY);
5541 #endif
5542 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
5543 needClose = gTrue;
5544 } else {
5545 error(errIO, -1, "Couldn't open text file '{0:s}'", fileName);
5546 ok = gFalse;
5547 actualText = NULL;
5548 return;
5550 outputFunc = &TextOutputDev_outputToFile;
5551 } else {
5552 outputStream = NULL;
5555 // set up text object
5556 text = new TextPage(rawOrderA);
5557 actualText = new ActualText(text);
5560 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
5561 GBool physLayoutA, double fixedPitchA,
5562 GBool rawOrderA) {
5563 outputFunc = func;
5564 outputStream = stream;
5565 needClose = gFalse;
5566 physLayout = physLayoutA;
5567 fixedPitch = physLayout ? fixedPitchA : 0;
5568 rawOrder = rawOrderA;
5569 doHTML = gFalse;
5570 text = new TextPage(rawOrderA);
5571 actualText = new ActualText(text);
5572 ok = gTrue;
5575 TextOutputDev::~TextOutputDev() {
5576 if (needClose) {
5577 #ifdef MACOS
5578 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
5579 #endif
5580 fclose((FILE *)outputStream);
5582 if (text) {
5583 text->decRefCnt();
5585 delete actualText;
5588 void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) {
5589 text->startPage(state);
5592 void TextOutputDev::endPage() {
5593 text->endPage();
5594 text->coalesce(physLayout, fixedPitch, doHTML);
5595 if (outputStream) {
5596 text->dump(outputStream, outputFunc, physLayout);
5600 void TextOutputDev::restoreState(GfxState *state) {
5601 text->updateFont(state);
5604 void TextOutputDev::updateFont(GfxState *state) {
5605 text->updateFont(state);
5608 void TextOutputDev::beginString(GfxState *state, GooString *s) {
5611 void TextOutputDev::endString(GfxState *state) {
5614 void TextOutputDev::drawChar(GfxState *state, double x, double y,
5615 double dx, double dy,
5616 double originX, double originY,
5617 CharCode c, int nBytes, Unicode *u, int uLen) {
5618 actualText->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5621 void TextOutputDev::incCharCount(int nChars) {
5622 text->incCharCount(nChars);
5625 void TextOutputDev::beginActualText(GfxState *state, GooString *text)
5627 actualText->begin(state, text);
5630 void TextOutputDev::endActualText(GfxState *state)
5632 actualText->end(state);
5635 void TextOutputDev::stroke(GfxState *state) {
5636 GfxPath *path;
5637 GfxSubpath *subpath;
5638 double x[2], y[2];
5640 if (!doHTML) {
5641 return;
5643 path = state->getPath();
5644 if (path->getNumSubpaths() != 1) {
5645 return;
5647 subpath = path->getSubpath(0);
5648 if (subpath->getNumPoints() != 2) {
5649 return;
5651 state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
5652 state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
5654 // look for a vertical or horizontal line
5655 if (x[0] == x[1] || y[0] == y[1]) {
5656 text->addUnderline(x[0], y[0], x[1], y[1]);
5660 void TextOutputDev::fill(GfxState *state) {
5661 GfxPath *path;
5662 GfxSubpath *subpath;
5663 double x[5], y[5];
5664 double rx0, ry0, rx1, ry1, t;
5665 int i;
5667 if (!doHTML) {
5668 return;
5670 path = state->getPath();
5671 if (path->getNumSubpaths() != 1) {
5672 return;
5674 subpath = path->getSubpath(0);
5675 if (subpath->getNumPoints() != 5) {
5676 return;
5678 for (i = 0; i < 5; ++i) {
5679 if (subpath->getCurve(i)) {
5680 return;
5682 state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
5685 // look for a rectangle
5686 if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
5687 x[0] == x[4] && y[0] == y[4]) {
5688 rx0 = x[0];
5689 ry0 = y[0];
5690 rx1 = x[2];
5691 ry1 = y[1];
5692 } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
5693 x[0] == x[4] && y[0] == y[4]) {
5694 rx0 = x[0];
5695 ry0 = y[0];
5696 rx1 = x[1];
5697 ry1 = y[2];
5698 } else {
5699 return;
5701 if (rx1 < rx0) {
5702 t = rx0;
5703 rx0 = rx1;
5704 rx1 = t;
5706 if (ry1 < ry0) {
5707 t = ry0;
5708 ry0 = ry1;
5709 ry1 = t;
5712 // skinny horizontal rectangle
5713 if (ry1 - ry0 < rx1 - rx0) {
5714 if (ry1 - ry0 < maxUnderlineWidth) {
5715 ry0 = 0.5 * (ry0 + ry1);
5716 text->addUnderline(rx0, ry0, rx1, ry0);
5719 // skinny vertical rectangle
5720 } else {
5721 if (rx1 - rx0 < maxUnderlineWidth) {
5722 rx0 = 0.5 * (rx0 + rx1);
5723 text->addUnderline(rx0, ry0, rx0, ry1);
5728 void TextOutputDev::eoFill(GfxState *state) {
5729 if (!doHTML) {
5730 return;
5732 fill(state);
5735 void TextOutputDev::processLink(AnnotLink *link) {
5736 double x1, y1, x2, y2;
5737 int xMin, yMin, xMax, yMax, x, y;
5739 if (!doHTML) {
5740 return;
5742 link->getRect(&x1, &y1, &x2, &y2);
5743 cvtUserToDev(x1, y1, &x, &y);
5744 xMin = xMax = x;
5745 yMin = yMax = y;
5746 cvtUserToDev(x1, y2, &x, &y);
5747 if (x < xMin) {
5748 xMin = x;
5749 } else if (x > xMax) {
5750 xMax = x;
5752 if (y < yMin) {
5753 yMin = y;
5754 } else if (y > yMax) {
5755 yMax = y;
5757 cvtUserToDev(x2, y1, &x, &y);
5758 if (x < xMin) {
5759 xMin = x;
5760 } else if (x > xMax) {
5761 xMax = x;
5763 if (y < yMin) {
5764 yMin = y;
5765 } else if (y > yMax) {
5766 yMax = y;
5768 cvtUserToDev(x2, y2, &x, &y);
5769 if (x < xMin) {
5770 xMin = x;
5771 } else if (x > xMax) {
5772 xMax = x;
5774 if (y < yMin) {
5775 yMin = y;
5776 } else if (y > yMax) {
5777 yMax = y;
5779 text->addLink(xMin, yMin, xMax, yMax, link);
5782 GBool TextOutputDev::findText(Unicode *s, int len,
5783 GBool startAtTop, GBool stopAtBottom,
5784 GBool startAtLast, GBool stopAtLast,
5785 GBool caseSensitive, GBool backward,
5786 GBool wholeWord,
5787 double *xMin, double *yMin,
5788 double *xMax, double *yMax) {
5789 return text->findText(s, len, startAtTop, stopAtBottom,
5790 startAtLast, stopAtLast,
5791 caseSensitive, backward, wholeWord,
5792 xMin, yMin, xMax, yMax);
5795 GooString *TextOutputDev::getText(double xMin, double yMin,
5796 double xMax, double yMax) {
5797 return text->getText(xMin, yMin, xMax, yMax);
5800 void TextOutputDev::drawSelection(OutputDev *out,
5801 double scale,
5802 int rotation,
5803 PDFRectangle *selection,
5804 SelectionStyle style,
5805 GfxColor *glyph_color, GfxColor *box_color) {
5806 text->drawSelection(out, scale, rotation, selection, style, glyph_color, box_color);
5809 GooList *TextOutputDev::getSelectionRegion(PDFRectangle *selection,
5810 SelectionStyle style,
5811 double scale) {
5812 return text->getSelectionRegion(selection, style, scale);
5815 GooString *TextOutputDev::getSelectionText(PDFRectangle *selection,
5816 SelectionStyle style)
5818 return text->getSelectionText(selection, style);
5821 GBool TextOutputDev::findCharRange(int pos, int length,
5822 double *xMin, double *yMin,
5823 double *xMax, double *yMax) {
5824 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
5827 void TextOutputDev::setMergeCombining(GBool merge) {
5828 text->setMergeCombining(merge);
5831 #if TEXTOUT_WORD_LIST
5832 TextWordList *TextOutputDev::makeWordList() {
5833 return text->makeWordList(physLayout);
5835 #endif
5837 TextPage *TextOutputDev::takeText() {
5838 TextPage *ret;
5840 ret = text;
5841 text = new TextPage(rawOrder);
5842 return ret;