1 //========================================================================
5 // Copyright 1997-2003 Glyph & Cog, LLC
7 //========================================================================
9 //========================================================================
11 // Modified under the Poppler project - http://poppler.freedesktop.org
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
16 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17 // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
18 // Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
20 // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
21 // Copyright (C) 2007, 2008, 2012 Adrian Johnson <ajohnson@redneon.com>
22 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
23 // Copyright (C) 2008, 2010-2012, 2014, 2015 Albert Astals Cid <aacid@kde.org>
24 // Copyright (C) 2008 Pino Toscano <pino@kde.org>
25 // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
26 // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
27 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
28 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
29 // Copyright (C) 2010 Marek Kasik <mkasik@redhat.com>
30 // Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
31 // Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
32 // Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
33 // Copyright (C) 2012, 2013-2015 Jason Crain <jason@aquaticape.us>
34 // Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
35 // Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
36 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
37 // Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
38 // Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
40 // To see a description of the changes please see the Changelog file that
41 // came with your tarball or type make ChangeLog if you are building from git
43 //========================================================================
47 #ifdef USE_GCC_PRAGMAS
48 #pragma implementation
58 #include <fcntl.h> // for O_BINARY
59 #include <io.h> // for setmode
62 #include "goo/GooString.h"
63 #include "goo/GooList.h"
64 #include "poppler-config.h"
66 #include "GlobalParams.h"
67 #include "UnicodeMap.h"
68 #include "UnicodeTypeTable.h"
70 #include "TextOutputDev.h"
76 // needed for setting type/creator of MacOS files
77 #include "ICSupport.h"
80 //------------------------------------------------------------------------
82 //------------------------------------------------------------------------
84 // Each bucket in a text pool includes baselines within a range of
86 #define textPoolStep 4
88 // Inter-character space width which will cause addChar to start a new
90 #define minWordBreakSpace 0.1
92 // Negative inter-character space width, i.e., overlap, which will
93 // cause addChar to start a new word.
94 #define minDupBreakOverlap 0.2
96 // Max distance between baselines of two lines within a block, as a
97 // fraction of the font size.
98 #define maxLineSpacingDelta 1.5
100 // Max difference in primary font sizes on two lines in the same
101 // block. Delta1 is used when examining new lines above and below the
102 // current block; delta2 is used when examining text that overlaps the
103 // current block; delta3 is used when examining text to the left and
104 // right of the current block.
105 #define maxBlockFontSizeDelta1 0.05
106 #define maxBlockFontSizeDelta2 0.6
107 #define maxBlockFontSizeDelta3 0.2
109 // Max difference in font sizes inside a word.
110 #define maxWordFontSizeDelta 0.05
112 // Maximum distance between baselines of two words on the same line,
113 // e.g., distance between subscript or superscript and the primary
114 // baseline, as a fraction of the font size.
115 #define maxIntraLineDelta 0.5
117 // Minimum inter-word spacing, as a fraction of the font size. (Only
118 // used for raw ordering.)
119 #define minWordSpacing 0.15
121 // Maximum inter-word spacing, as a fraction of the font size.
122 #define maxWordSpacing 1.5
124 // Maximum horizontal spacing which will allow a word to be pulled
126 #define minColSpacing1 0.3
128 // Minimum spacing between columns, as a fraction of the font size.
129 #define minColSpacing2 1.0
131 // Maximum vertical spacing between blocks within a flow, as a
132 // multiple of the font size.
133 #define maxBlockSpacing 2.5
135 // Minimum spacing between characters within a word, as a fraction of
137 #define minCharSpacing -0.5
139 // Maximum spacing between characters within a word, as a fraction of
140 // the font size, when there is no obvious extra-wide character
142 #define maxCharSpacing 0.03
144 // When extra-wide character spacing is detected, the inter-character
145 // space threshold is set to the minimum inter-character space
146 // multiplied by this constant.
147 #define maxWideCharSpacingMul 1.3
149 // Upper limit on spacing between characters in a word.
150 #define maxWideCharSpacing 0.4
152 // Max difference in primary,secondary coordinates (as a fraction of
153 // the font size) allowed for duplicated text (fake boldface, drop
154 // shadows) which is to be discarded.
155 #define dupMaxPriDelta 0.1
156 #define dupMaxSecDelta 0.2
158 // Max width of underlines (in points).
159 #define maxUnderlineWidth 3
161 // Min distance between baseline and underline (in points).
162 //~ this should be font-size-dependent
163 #define minUnderlineGap -2
165 // Max distance between baseline and underline (in points).
166 //~ this should be font-size-dependent
167 #define maxUnderlineGap 4
169 // Max horizontal distance between edge of word and start of underline
171 //~ this should be font-size-dependent
172 #define underlineSlack 1
174 // Max distance between edge of text and edge of link border
175 #define hyperlinkSlack 2
177 // Max distance between characters when combining a base character and
178 // combining character
179 #define combMaxMidDelta 0.3
180 #define combMaxBaseDelta 0.4
182 static int reorderText(Unicode
*text
, int len
, UnicodeMap
*uMap
, GBool primaryLR
, GooString
*s
, Unicode
* u
) {
183 char lre
[8], rle
[8], popdf
[8], buf
[8];
184 int lreLen
= 0, rleLen
= 0, popdfLen
= 0, n
;
190 lreLen
= uMap
->mapUnicode(0x202a, lre
, sizeof(lre
));
191 rleLen
= uMap
->mapUnicode(0x202b, rle
, sizeof(rle
));
192 popdfLen
= uMap
->mapUnicode(0x202c, popdf
, sizeof(popdf
));
198 // output a left-to-right section
199 for (j
= i
; j
< len
&& !unicodeTypeR(text
[j
]); ++j
) ;
200 for (k
= i
; k
< j
; ++k
) {
202 n
= uMap
->mapUnicode(text
[k
], buf
, sizeof(buf
));
205 if (u
) u
[nCols
] = text
[k
];
209 // output a right-to-left section
211 j
< len
&& !(unicodeTypeL(text
[j
]) || unicodeTypeNum(text
[j
]));
214 if (s
) s
->append(rle
, rleLen
);
215 for (k
= j
- 1; k
>= i
; --k
) {
217 n
= uMap
->mapUnicode(text
[k
], buf
, sizeof(buf
));
220 if (u
) u
[nCols
] = text
[k
];
223 if (s
) s
->append(popdf
, popdfLen
);
228 // Note: This code treats numeric characters (European and
229 // Arabic/Indic) as left-to-right, which isn't strictly correct
230 // (incurs extra LRE/POPDF pairs), but does produce correct
231 // visual formatting.
232 if (s
) s
->append(rle
, rleLen
);
235 // output a right-to-left section
237 j
>= 0 && !(unicodeTypeL(text
[j
]) || unicodeTypeNum(text
[j
]));
239 for (k
= i
; k
> j
; --k
) {
241 n
= uMap
->mapUnicode(text
[k
], buf
, sizeof(buf
));
244 if (u
) u
[nCols
] = text
[k
];
248 // output a left-to-right section
249 for (j
= i
; j
>= 0 && !unicodeTypeR(text
[j
]); --j
) ;
251 if (s
) s
->append(lre
, lreLen
);
252 for (k
= j
+ 1; k
<= i
; ++k
) {
254 n
= uMap
->mapUnicode(text
[k
], buf
, sizeof(buf
));
257 if (u
) u
[nCols
] = text
[k
];
260 if (s
) s
->append(popdf
, popdfLen
);
264 if (s
) s
->append(popdf
, popdfLen
);
270 //------------------------------------------------------------------------
272 //------------------------------------------------------------------------
274 class TextUnderline
{
277 TextUnderline(double x0A
, double y0A
, double x1A
, double y1A
)
278 { x0
= x0A
; y0
= y0A
; x1
= x1A
; y1
= y1A
; horiz
= y0
== y1
; }
281 double x0
, y0
, x1
, y1
;
285 //------------------------------------------------------------------------
287 //------------------------------------------------------------------------
292 TextLink(int xMinA
, int yMinA
, int xMaxA
, int yMaxA
, AnnotLink
*linkA
)
293 { xMin
= xMinA
; yMin
= yMinA
; xMax
= xMaxA
; yMax
= yMaxA
; link
= linkA
; }
296 int xMin
, yMin
, xMax
, yMax
;
300 //------------------------------------------------------------------------
302 //------------------------------------------------------------------------
304 TextFontInfo::TextFontInfo(GfxState
*state
) {
305 gfxFont
= state
->getFont();
307 gfxFont
->incRefCnt();
308 #if TEXTOUT_WORD_LIST
309 fontName
= (gfxFont
&& gfxFont
->getName()) ? gfxFont
->getName()->copy()
311 flags
= gfxFont
? gfxFont
->getFlags() : 0;
315 TextFontInfo::~TextFontInfo() {
317 gfxFont
->decRefCnt();
318 #if TEXTOUT_WORD_LIST
325 GBool
TextFontInfo::matches(GfxState
*state
) {
326 return state
->getFont() == gfxFont
;
329 GBool
TextFontInfo::matches(TextFontInfo
*fontInfo
) {
330 return gfxFont
== fontInfo
->gfxFont
;
333 double TextFontInfo::getAscent() {
334 return gfxFont
? gfxFont
->getAscent() : 0.95;
337 double TextFontInfo::getDescent() {
338 return gfxFont
? gfxFont
->getDescent() : -0.35;
341 int TextFontInfo::getWMode() {
342 return gfxFont
? gfxFont
->getWMode() : 0;
345 //------------------------------------------------------------------------
347 //------------------------------------------------------------------------
349 TextWord::TextWord(GfxState
*state
, int rotA
, double fontSizeA
) {
351 fontSize
= fontSizeA
;
362 #if TEXTOUT_WORD_LIST
365 if ((state
->getRender() & 3) == 1) {
366 state
->getStrokeRGB(&rgb
);
368 state
->getFillRGB(&rgb
);
370 colorR
= colToDbl(rgb
.r
);
371 colorG
= colToDbl(rgb
.g
);
372 colorB
= colToDbl(rgb
.b
);
379 TextWord::~TextWord() {
388 void TextWord::addChar(GfxState
*state
, TextFontInfo
*fontA
, double x
, double y
,
389 double dx
, double dy
, int charPosA
, int charLen
,
390 CharCode c
, Unicode u
, const Matrix
&textMatA
) {
391 ensureCapacity(len
+1);
394 charPos
[len
] = charPosA
;
395 charPos
[len
+ 1] = charPosA
+ charLen
;
397 textMat
[len
] = textMatA
;
400 setInitialBounds(fontA
, x
, y
);
402 if (wMode
) { // vertical writing mode
403 // NB: the rotation value has been incremented by 1 (in
404 // TextPage::beginWord()) for vertical writing mode
407 edge
[len
] = x
- fontSize
;
408 xMax
= edge
[len
+1] = x
;
411 edge
[len
] = y
- fontSize
;
412 yMax
= edge
[len
+1] = y
;
415 edge
[len
] = x
+ fontSize
;
416 xMin
= edge
[len
+1] = x
;
419 edge
[len
] = y
+ fontSize
;
420 yMin
= edge
[len
+1] = y
;
423 } else { // horizontal writing mode
427 xMax
= edge
[len
+1] = x
+ dx
;
431 yMax
= edge
[len
+1] = y
+ dy
;
435 xMin
= edge
[len
+1] = x
+ dx
;
439 yMin
= edge
[len
+1] = y
+ dy
;
446 void TextWord::setInitialBounds(TextFontInfo
*fontA
, double x
, double y
) {
447 double ascent
= fontA
->getAscent() * fontSize
;
448 double descent
= fontA
->getDescent() * fontSize
;
449 wMode
= fontA
->getWMode();
451 if (wMode
) { // vertical writing mode
452 // NB: the rotation value has been incremented by 1 (in
453 // TextPage::beginWord()) for vertical writing mode
480 } else { // horizontal writing mode
487 // this is a sanity check for a case that shouldn't happen -- but
488 // if it does happen, we want to avoid dividing by zero later
499 // this is a sanity check for a case that shouldn't happen -- but
500 // if it does happen, we want to avoid dividing by zero later
511 // this is a sanity check for a case that shouldn't happen -- but
512 // if it does happen, we want to avoid dividing by zero later
523 // this is a sanity check for a case that shouldn't happen -- but
524 // if it does happen, we want to avoid dividing by zero later
534 void TextWord::ensureCapacity(int capacity
) {
535 if (capacity
> size
) {
536 size
= std::max(size
+ 16, capacity
);
537 text
= (Unicode
*)greallocn(text
, size
, sizeof(Unicode
));
538 charcode
= (CharCode
*)greallocn(charcode
, (size
+ 1), sizeof(CharCode
));
539 edge
= (double *)greallocn(edge
, (size
+ 1), sizeof(double));
540 charPos
= (int *)greallocn(charPos
, size
+ 1, sizeof(int));
541 font
= (TextFontInfo
**)greallocn(font
, size
, sizeof(TextFontInfo
*));
542 textMat
= (Matrix
*)greallocn(textMat
, size
, sizeof(Matrix
));
546 struct CombiningTable
{
551 static struct CombiningTable combiningTable
[] = {
552 {0x0060, 0x0300}, // grave
553 {0x00a8, 0x0308}, // dieresis
554 {0x00af, 0x0304}, // macron
555 {0x00b4, 0x0301}, // acute
556 {0x00b8, 0x0327}, // cedilla
557 {0x02c6, 0x0302}, // circumflex
558 {0x02c7, 0x030c}, // caron
559 {0x02d8, 0x0306}, // breve
560 {0x02d9, 0x0307}, // dotaccent
561 {0x02da, 0x030a}, // ring
562 {0x02dc, 0x0303}, // tilde
563 {0x02dd, 0x030b} // hungarumlaut (double acute accent)
566 // returning combining versions of characters
567 Unicode
getCombiningChar(Unicode u
) {
568 int len
= sizeof(combiningTable
) / sizeof(combiningTable
[0]);
569 for (int i
= 0; i
< len
; ++i
) {
570 if (u
== combiningTable
[i
].base
)
571 return combiningTable
[i
].comb
;
576 GBool
TextWord::addCombining(GfxState
*state
, TextFontInfo
*fontA
, double fontSizeA
, double x
, double y
,
577 double dx
, double dy
, int charPosA
, int charLen
,
578 CharCode c
, Unicode u
, const Matrix
&textMatA
) {
579 if (len
== 0 || wMode
!= 0 || fontA
->getWMode() != 0)
582 Unicode cCurrent
= getCombiningChar(u
);
583 Unicode cPrev
= getCombiningChar(text
[len
-1]);
584 double edgeMid
= (edge
[len
-1] + edge
[len
]) / 2;
585 double charMid
, maxScaledMidDelta
, charBase
, maxScaledBaseDelta
;
587 if (cCurrent
!= 0 && unicodeTypeAlphaNum(text
[len
-1])) {
588 // Current is a combining character, previous is base character
589 maxScaledMidDelta
= fabs(edge
[len
] - edge
[len
-1]) * combMaxMidDelta
;
590 charMid
= charBase
= maxScaledBaseDelta
= 0;
592 // Test if characters overlap
593 if (rot
== 0 || rot
== 2) {
594 charMid
= x
+ (dx
/ 2);
596 maxScaledBaseDelta
= (yMax
- yMin
) * combMaxBaseDelta
;
598 charMid
= y
+ (dy
/ 2);
600 maxScaledBaseDelta
= (xMax
- xMin
) * combMaxBaseDelta
;
603 if (fabs(charMid
- edgeMid
) >= maxScaledMidDelta
||
604 fabs(charBase
- base
) >= maxScaledBaseDelta
)
607 // Add character, but don't adjust edge / bounding box because
608 // combining character's positioning could be odd.
609 ensureCapacity(len
+1);
610 text
[len
] = cCurrent
;
612 charPos
[len
] = charPosA
;
613 charPos
[len
+1] = charPosA
+ charLen
;
615 textMat
[len
] = textMatA
;
616 edge
[len
+1] = edge
[len
];
617 edge
[len
] = (edge
[len
+1] + edge
[len
-1]) / 2;
622 if (cPrev
!= 0 && unicodeTypeAlphaNum(u
)) {
623 // Previous is a combining character, current is base character
624 maxScaledBaseDelta
= (fontA
->getAscent() - fontA
->getDescent()) * fontSizeA
* combMaxBaseDelta
;
625 charMid
= charBase
= maxScaledMidDelta
= 0;
627 // Test if characters overlap
628 if (rot
== 0 || rot
== 2) {
629 charMid
= x
+ (dx
/ 2);
631 maxScaledMidDelta
= fabs(dx
* combMaxMidDelta
);
633 charMid
= y
+ (dy
/ 2);
635 maxScaledMidDelta
= fabs(dy
* combMaxMidDelta
);
638 if (fabs(charMid
- edgeMid
) >= maxScaledMidDelta
||
639 fabs(charBase
- base
) >= maxScaledBaseDelta
)
642 // move combining character to after base character
643 ensureCapacity(len
+1);
644 fontSize
= fontSizeA
;
646 charcode
[len
] = charcode
[len
-1];
647 charPos
[len
] = charPosA
;
648 charPos
[len
+1] = charPosA
+ charLen
;
649 font
[len
] = font
[len
-1];
650 textMat
[len
] = textMat
[len
-1];
655 textMat
[len
-1] = textMatA
;
658 setInitialBounds(fontA
, x
, y
);
660 // Updated edges / bounding box because we changed the base
665 edge
[len
-1] = x
- fontSize
;
666 xMax
= edge
[len
+1] = x
;
669 edge
[len
-1] = y
- fontSize
;
670 yMax
= edge
[len
+1] = y
;
673 edge
[len
-1] = x
+ fontSize
;
674 xMin
= edge
[len
+1] = x
;
677 edge
[len
-1] = y
+ fontSize
;
678 yMin
= edge
[len
+1] = y
;
685 xMax
= edge
[len
+1] = x
+ dx
;
689 yMax
= edge
[len
+1] = y
+ dy
;
693 xMin
= edge
[len
+1] = x
+ dx
;
697 yMin
= edge
[len
+1] = y
+ dy
;
702 edge
[len
] = (edge
[len
+1] + edge
[len
-1]) / 2;
709 void TextWord::merge(TextWord
*word
) {
712 if (word
->xMin
< xMin
) {
715 if (word
->yMin
< yMin
) {
718 if (word
->xMax
> xMax
) {
721 if (word
->yMax
> yMax
) {
724 ensureCapacity(len
+ word
->len
);
725 for (i
= 0; i
< word
->len
; ++i
) {
726 text
[len
+ i
] = word
->text
[i
];
727 charcode
[len
+ i
] = word
->charcode
[i
];
728 edge
[len
+ i
] = word
->edge
[i
];
729 charPos
[len
+ i
] = word
->charPos
[i
];
730 font
[len
+ i
] = word
->font
[i
];
731 textMat
[len
+ i
] = word
->textMat
[i
];
733 edge
[len
+ word
->len
] = word
->edge
[word
->len
];
734 charPos
[len
+ word
->len
] = word
->charPos
[word
->len
];
738 inline int TextWord::primaryCmp(TextWord
*word
) {
741 cmp
= 0; // make gcc happy
744 cmp
= xMin
- word
->xMin
;
747 cmp
= yMin
- word
->yMin
;
750 cmp
= word
->xMax
- xMax
;
753 cmp
= word
->yMax
- yMax
;
756 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
759 double TextWord::primaryDelta(TextWord
*word
) {
762 delta
= 0; // make gcc happy
765 delta
= word
->xMin
- xMax
;
768 delta
= word
->yMin
- yMax
;
771 delta
= xMin
- word
->xMax
;
774 delta
= yMin
- word
->yMax
;
780 int TextWord::cmpYX(const void *p1
, const void *p2
) {
781 TextWord
*word1
= *(TextWord
**)p1
;
782 TextWord
*word2
= *(TextWord
**)p2
;
785 cmp
= word1
->yMin
- word2
->yMin
;
787 cmp
= word1
->xMin
- word2
->xMin
;
789 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
792 #if TEXTOUT_WORD_LIST
794 GooString
*TextWord::getText() {
801 if (!(uMap
= globalParams
->getTextEncoding())) {
804 for (i
= 0; i
< len
; ++i
) {
805 n
= uMap
->mapUnicode(text
[i
], buf
, sizeof(buf
));
812 void TextWord::getCharBBox(int charIdx
, double *xMinA
, double *yMinA
,
813 double *xMaxA
, double *yMaxA
) {
814 if (charIdx
< 0 || charIdx
>= len
) {
819 *xMinA
= edge
[charIdx
];
820 *xMaxA
= edge
[charIdx
+ 1];
827 *yMinA
= edge
[charIdx
];
828 *yMaxA
= edge
[charIdx
+ 1];
831 *xMinA
= edge
[charIdx
+ 1];
832 *xMaxA
= edge
[charIdx
];
839 *yMinA
= edge
[charIdx
+ 1];
840 *yMaxA
= edge
[charIdx
];
845 #endif // TEXTOUT_WORD_LIST
847 //------------------------------------------------------------------------
849 //------------------------------------------------------------------------
851 TextPool::TextPool() {
859 TextPool::~TextPool() {
861 TextWord
*word
, *word2
;
863 for (baseIdx
= minBaseIdx
; baseIdx
<= maxBaseIdx
; ++baseIdx
) {
864 for (word
= pool
[baseIdx
- minBaseIdx
]; word
; word
= word2
) {
872 int TextPool::getBaseIdx(double base
) {
875 baseIdx
= (int)(base
/ textPoolStep
);
876 if (baseIdx
< minBaseIdx
) {
879 if (baseIdx
> maxBaseIdx
) {
885 void TextPool::addWord(TextWord
*word
) {
887 int wordBaseIdx
, newMinBaseIdx
, newMaxBaseIdx
, baseIdx
;
890 // expand the array if needed
891 if (unlikely((word
->base
/ textPoolStep
) > INT_MAX
)) {
892 error(errSyntaxWarning
, -1, "word->base / textPoolStep > INT_MAX");
895 wordBaseIdx
= (int)(word
->base
/ textPoolStep
);
896 if (minBaseIdx
> maxBaseIdx
) {
897 minBaseIdx
= wordBaseIdx
- 128;
898 maxBaseIdx
= wordBaseIdx
+ 128;
899 pool
= (TextWord
**)gmallocn(maxBaseIdx
- minBaseIdx
+ 1,
901 for (baseIdx
= minBaseIdx
; baseIdx
<= maxBaseIdx
; ++baseIdx
) {
902 pool
[baseIdx
- minBaseIdx
] = NULL
;
904 } else if (wordBaseIdx
< minBaseIdx
) {
905 newMinBaseIdx
= wordBaseIdx
- 128;
906 newPool
= (TextWord
**)gmallocn(maxBaseIdx
- newMinBaseIdx
+ 1,
908 for (baseIdx
= newMinBaseIdx
; baseIdx
< minBaseIdx
; ++baseIdx
) {
909 newPool
[baseIdx
- newMinBaseIdx
] = NULL
;
911 memcpy(&newPool
[minBaseIdx
- newMinBaseIdx
], pool
,
912 (maxBaseIdx
- minBaseIdx
+ 1) * sizeof(TextWord
*));
915 minBaseIdx
= newMinBaseIdx
;
916 } else if (wordBaseIdx
> maxBaseIdx
) {
917 newMaxBaseIdx
= wordBaseIdx
+ 128;
918 pool
= (TextWord
**)greallocn(pool
, newMaxBaseIdx
- minBaseIdx
+ 1,
920 for (baseIdx
= maxBaseIdx
+ 1; baseIdx
<= newMaxBaseIdx
; ++baseIdx
) {
921 pool
[baseIdx
- minBaseIdx
] = NULL
;
923 maxBaseIdx
= newMaxBaseIdx
;
926 // insert the new word
927 if (cursor
&& wordBaseIdx
== cursorBaseIdx
&&
928 word
->primaryCmp(cursor
) >= 0) {
933 w1
= pool
[wordBaseIdx
- minBaseIdx
];
935 for (; w1
&& word
->primaryCmp(w1
) > 0; w0
= w1
, w1
= w1
->next
) ;
940 pool
[wordBaseIdx
- minBaseIdx
] = word
;
943 cursorBaseIdx
= wordBaseIdx
;
946 //------------------------------------------------------------------------
948 //------------------------------------------------------------------------
950 TextLine::TextLine(TextBlock
*blkA
, int rotA
, double baseA
) {
954 words
= lastWord
= NULL
;
966 normalized_idx
= NULL
;
969 TextLine::~TextLine() {
982 gfree(normalized_idx
);
986 void TextLine::addWord(TextWord
*word
) {
988 lastWord
->next
= word
;
1000 if (word
->xMin
< xMin
) {
1003 if (word
->xMax
> xMax
) {
1006 if (word
->yMin
< yMin
) {
1009 if (word
->yMax
> yMax
) {
1015 double TextLine::primaryDelta(TextLine
*line
) {
1018 delta
= 0; // make gcc happy
1021 delta
= line
->xMin
- xMax
;
1024 delta
= line
->yMin
- yMax
;
1027 delta
= xMin
- line
->xMax
;
1030 delta
= yMin
- line
->yMax
;
1036 int TextLine::primaryCmp(TextLine
*line
) {
1039 cmp
= 0; // make gcc happy
1042 cmp
= xMin
- line
->xMin
;
1045 cmp
= yMin
- line
->yMin
;
1048 cmp
= line
->xMax
- xMax
;
1051 cmp
= line
->yMax
- yMax
;
1054 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1057 int TextLine::secondaryCmp(TextLine
*line
) {
1060 cmp
= (rot
== 0 || rot
== 3) ? base
- line
->base
: line
->base
- base
;
1061 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1064 int TextLine::cmpYX(TextLine
*line
) {
1067 if ((cmp
= secondaryCmp(line
))) {
1070 return primaryCmp(line
);
1073 int TextLine::cmpXY(const void *p1
, const void *p2
) {
1074 TextLine
*line1
= *(TextLine
**)p1
;
1075 TextLine
*line2
= *(TextLine
**)p2
;
1078 if ((cmp
= line1
->primaryCmp(line2
))) {
1081 return line1
->secondaryCmp(line2
);
1084 void TextLine::coalesce(UnicodeMap
*uMap
) {
1085 TextWord
*word0
, *word1
;
1086 double space
, delta
, minSpace
;
1093 // compute the inter-word space threshold
1094 if (words
->len
> 1 || words
->next
->len
> 1) {
1097 minSpace
= words
->primaryDelta(words
->next
);
1098 for (word0
= words
->next
, word1
= word0
->next
;
1099 word1
&& minSpace
> 0;
1100 word0
= word1
, word1
= word0
->next
) {
1101 if (word1
->len
> 1) {
1104 delta
= word0
->primaryDelta(word1
);
1105 if (delta
< minSpace
) {
1110 if (minSpace
<= 0) {
1111 space
= maxCharSpacing
* words
->fontSize
;
1113 space
= maxWideCharSpacingMul
* minSpace
;
1114 if (space
> maxWideCharSpacing
* words
->fontSize
) {
1115 space
= maxWideCharSpacing
* words
->fontSize
;
1121 word1
= words
->next
;
1123 if (word0
->primaryDelta(word1
) >= space
) {
1124 word0
->spaceAfter
= gTrue
;
1126 word1
= word1
->next
;
1127 } else if (word0
->font
[word0
->len
- 1] == word1
->font
[0] &&
1128 word0
->underlined
== word1
->underlined
&&
1129 fabs(word0
->fontSize
- word1
->fontSize
) <
1130 maxWordFontSizeDelta
* words
->fontSize
&&
1131 word1
->charPos
[0] == word0
->charPos
[word0
->len
]) {
1132 word0
->merge(word1
);
1133 word0
->next
= word1
->next
;
1135 word1
= word0
->next
;
1138 word1
= word1
->next
;
1143 // build the line text
1144 isUnicode
= uMap
? uMap
->isUnicode() : gFalse
;
1146 for (word1
= words
; word1
; word1
= word1
->next
) {
1148 if (word1
->spaceAfter
) {
1152 text
= (Unicode
*)gmallocn(len
, sizeof(Unicode
));
1153 edge
= (double *)gmallocn(len
+ 1, sizeof(double));
1155 for (word1
= words
; word1
; word1
= word1
->next
) {
1156 for (j
= 0; j
< word1
->len
; ++j
) {
1157 text
[i
] = word1
->text
[j
];
1158 edge
[i
] = word1
->edge
[j
];
1161 edge
[i
] = word1
->edge
[word1
->len
];
1162 if (word1
->spaceAfter
) {
1163 text
[i
] = (Unicode
)0x0020;
1168 // compute convertedLen and set up the col array
1169 col
= (int *)gmallocn(len
+ 1, sizeof(int));
1171 for (i
= 0; i
< len
; ++i
) {
1172 col
[i
] = convertedLen
;
1176 convertedLen
+= uMap
->mapUnicode(text
[i
], buf
, sizeof(buf
));
1179 col
[len
] = convertedLen
;
1181 // check for hyphen at end of line
1182 //~ need to check for other chars used as hyphens
1183 hyphenated
= text
[len
- 1] == (Unicode
)'-';
1186 //------------------------------------------------------------------------
1188 //------------------------------------------------------------------------
1190 class TextLineFrag
{
1193 TextLine
*line
; // the line object
1194 int start
, len
; // offset and length of this fragment
1195 // (in Unicode chars)
1196 double xMin
, xMax
; // bounding box coordinates
1198 double base
; // baseline virtual coordinate
1199 int col
; // first column
1201 void init(TextLine
*lineA
, int startA
, int lenA
);
1202 void computeCoords(GBool oneRot
);
1204 static int cmpYXPrimaryRot(const void *p1
, const void *p2
);
1205 static int cmpYXLineRot(const void *p1
, const void *p2
);
1206 static int cmpXYLineRot(const void *p1
, const void *p2
);
1207 static int cmpXYColumnPrimaryRot(const void *p1
, const void *p2
);
1208 static int cmpXYColumnLineRot(const void *p1
, const void *p2
);
1211 void TextLineFrag::init(TextLine
*lineA
, int startA
, int lenA
) {
1215 col
= line
->col
[start
];
1218 void TextLineFrag::computeCoords(GBool oneRot
) {
1220 double d0
, d1
, d2
, d3
, d4
;
1224 switch (line
->rot
) {
1226 xMin
= line
->edge
[start
];
1227 xMax
= line
->edge
[start
+ len
];
1234 yMin
= line
->edge
[start
];
1235 yMax
= line
->edge
[start
+ len
];
1238 xMin
= line
->edge
[start
+ len
];
1239 xMax
= line
->edge
[start
];
1246 yMin
= line
->edge
[start
+ len
];
1247 yMax
= line
->edge
[start
];
1254 if (line
->rot
== 0 && line
->blk
->page
->primaryRot
== 0) {
1256 xMin
= line
->edge
[start
];
1257 xMax
= line
->edge
[start
+ len
];
1265 d0
= line
->edge
[start
];
1266 d1
= line
->edge
[start
+ len
];
1267 d2
= d3
= d4
= 0; // make gcc happy
1269 switch (line
->rot
) {
1274 d0
= (d0
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
1275 d1
= (d1
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
1276 d2
= (d2
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
1277 d3
= (d3
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
1278 d4
= (d4
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
1284 d0
= (d0
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
1285 d1
= (d1
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
1286 d2
= (blk
->xMax
- d2
) / (blk
->xMax
- blk
->xMin
);
1287 d3
= (blk
->xMax
- d3
) / (blk
->xMax
- blk
->xMin
);
1288 d4
= (blk
->xMax
- d4
) / (blk
->xMax
- blk
->xMin
);
1294 d0
= (blk
->xMax
- d0
) / (blk
->xMax
- blk
->xMin
);
1295 d1
= (blk
->xMax
- d1
) / (blk
->xMax
- blk
->xMin
);
1296 d2
= (blk
->yMax
- d2
) / (blk
->yMax
- blk
->yMin
);
1297 d3
= (blk
->yMax
- d3
) / (blk
->yMax
- blk
->yMin
);
1298 d4
= (blk
->yMax
- d4
) / (blk
->yMax
- blk
->yMin
);
1304 d0
= (blk
->yMax
- d0
) / (blk
->yMax
- blk
->yMin
);
1305 d1
= (blk
->yMax
- d1
) / (blk
->yMax
- blk
->yMin
);
1306 d2
= (d2
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
1307 d3
= (d3
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
1308 d4
= (d4
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
1312 switch (line
->blk
->page
->primaryRot
) {
1314 xMin
= blk
->xMin
+ d0
* (blk
->xMax
- blk
->xMin
);
1315 xMax
= blk
->xMin
+ d1
* (blk
->xMax
- blk
->xMin
);
1316 yMin
= blk
->yMin
+ d2
* (blk
->yMax
- blk
->yMin
);
1317 yMax
= blk
->yMin
+ d3
* (blk
->yMax
- blk
->yMin
);
1318 base
= blk
->yMin
+ d4
* (blk
->yMax
- blk
->yMin
);
1321 xMin
= blk
->xMax
- d3
* (blk
->xMax
- blk
->xMin
);
1322 xMax
= blk
->xMax
- d2
* (blk
->xMax
- blk
->xMin
);
1323 yMin
= blk
->yMin
+ d0
* (blk
->yMax
- blk
->yMin
);
1324 yMax
= blk
->yMin
+ d1
* (blk
->yMax
- blk
->yMin
);
1325 base
= blk
->xMax
- d4
* (blk
->xMax
- blk
->xMin
);
1328 xMin
= blk
->xMax
- d1
* (blk
->xMax
- blk
->xMin
);
1329 xMax
= blk
->xMax
- d0
* (blk
->xMax
- blk
->xMin
);
1330 yMin
= blk
->yMax
- d3
* (blk
->yMax
- blk
->yMin
);
1331 yMax
= blk
->yMax
- d2
* (blk
->yMax
- blk
->yMin
);
1332 base
= blk
->yMax
- d4
* (blk
->yMax
- blk
->yMin
);
1335 xMin
= blk
->xMin
+ d2
* (blk
->xMax
- blk
->xMin
);
1336 xMax
= blk
->xMin
+ d3
* (blk
->xMax
- blk
->xMin
);
1337 yMin
= blk
->yMax
- d1
* (blk
->yMax
- blk
->yMin
);
1338 yMax
= blk
->yMax
- d0
* (blk
->yMax
- blk
->yMin
);
1339 base
= blk
->xMin
+ d4
* (blk
->xMax
- blk
->xMin
);
1347 int TextLineFrag::cmpYXPrimaryRot(const void *p1
, const void *p2
) {
1348 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
1349 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
1352 cmp
= 0; // make gcc happy
1353 switch (frag1
->line
->blk
->page
->primaryRot
) {
1355 if (fabs(cmp
= frag1
->yMin
- frag2
->yMin
) < 0.01) {
1356 cmp
= frag1
->xMin
- frag2
->xMin
;
1360 if (fabs(cmp
= frag2
->xMax
- frag1
->xMax
) < 0.01) {
1361 cmp
= frag1
->yMin
- frag2
->yMin
;
1365 if (fabs(cmp
= frag2
->yMin
- frag1
->yMin
) < 0.01) {
1366 cmp
= frag2
->xMax
- frag1
->xMax
;
1370 if (fabs(cmp
= frag1
->xMax
- frag2
->xMax
) < 0.01) {
1371 cmp
= frag2
->yMax
- frag1
->yMax
;
1375 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1378 int TextLineFrag::cmpYXLineRot(const void *p1
, const void *p2
) {
1379 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
1380 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
1383 cmp
= 0; // make gcc happy
1384 switch (frag1
->line
->rot
) {
1386 if ((cmp
= frag1
->yMin
- frag2
->yMin
) == 0) {
1387 cmp
= frag1
->xMin
- frag2
->xMin
;
1391 if ((cmp
= frag2
->xMax
- frag1
->xMax
) == 0) {
1392 cmp
= frag1
->yMin
- frag2
->yMin
;
1396 if ((cmp
= frag2
->yMin
- frag1
->yMin
) == 0) {
1397 cmp
= frag2
->xMax
- frag1
->xMax
;
1401 if ((cmp
= frag1
->xMax
- frag2
->xMax
) == 0) {
1402 cmp
= frag2
->yMax
- frag1
->yMax
;
1406 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1409 int TextLineFrag::cmpXYLineRot(const void *p1
, const void *p2
) {
1410 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
1411 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
1414 cmp
= 0; // make gcc happy
1415 switch (frag1
->line
->rot
) {
1417 if ((cmp
= frag1
->xMin
- frag2
->xMin
) == 0) {
1418 cmp
= frag1
->yMin
- frag2
->yMin
;
1422 if ((cmp
= frag1
->yMin
- frag2
->yMin
) == 0) {
1423 cmp
= frag2
->xMax
- frag1
->xMax
;
1427 if ((cmp
= frag2
->xMax
- frag1
->xMax
) == 0) {
1428 cmp
= frag2
->yMin
- frag1
->yMin
;
1432 if ((cmp
= frag2
->yMax
- frag1
->yMax
) == 0) {
1433 cmp
= frag1
->xMax
- frag2
->xMax
;
1437 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1440 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1
, const void *p2
) {
1441 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
1442 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
1445 // if columns overlap, compare y values
1446 if (frag1
->col
< frag2
->col
+ (frag2
->line
->col
[frag2
->start
+ frag2
->len
] -
1447 frag2
->line
->col
[frag2
->start
]) &&
1448 frag2
->col
< frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
1449 frag1
->line
->col
[frag1
->start
])) {
1450 cmp
= 0; // make gcc happy
1451 switch (frag1
->line
->blk
->page
->primaryRot
) {
1452 case 0: cmp
= frag1
->yMin
- frag2
->yMin
; break;
1453 case 1: cmp
= frag2
->xMax
- frag1
->xMax
; break;
1454 case 2: cmp
= frag2
->yMin
- frag1
->yMin
; break;
1455 case 3: cmp
= frag1
->xMax
- frag2
->xMax
; break;
1457 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1460 // otherwise, compare starting column
1461 return frag1
->col
- frag2
->col
;
1464 int TextLineFrag::cmpXYColumnLineRot(const void *p1
, const void *p2
) {
1465 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
1466 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
1469 // if columns overlap, compare y values
1470 if (frag1
->col
< frag2
->col
+ (frag2
->line
->col
[frag2
->start
+ frag2
->len
] -
1471 frag2
->line
->col
[frag2
->start
]) &&
1472 frag2
->col
< frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
1473 frag1
->line
->col
[frag1
->start
])) {
1474 cmp
= 0; // make gcc happy
1475 switch (frag1
->line
->rot
) {
1476 case 0: cmp
= frag1
->yMin
- frag2
->yMin
; break;
1477 case 1: cmp
= frag2
->xMax
- frag1
->xMax
; break;
1478 case 2: cmp
= frag2
->yMin
- frag1
->yMin
; break;
1479 case 3: cmp
= frag1
->xMax
- frag2
->xMax
; break;
1481 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1484 // otherwise, compare starting column
1485 return frag1
->col
- frag2
->col
;
1488 //------------------------------------------------------------------------
1490 //------------------------------------------------------------------------
1492 TextBlock::TextBlock(TextPage
*pageA
, int rotA
) {
1498 priMax
= page
->pageWidth
;
1499 pool
= new TextPool();
1508 TextBlock::~TextBlock() {
1514 lines
= lines
->next
;
1519 void TextBlock::addWord(TextWord
*word
) {
1520 pool
->addWord(word
);
1527 if (word
->xMin
< xMin
) {
1530 if (word
->xMax
> xMax
) {
1533 if (word
->yMin
< yMin
) {
1536 if (word
->yMax
> yMax
) {
1542 void TextBlock::coalesce(UnicodeMap
*uMap
, double fixedPitch
) {
1543 TextWord
*word0
, *word1
, *word2
, *bestWord0
, *bestWord1
, *lastWord
;
1544 TextLine
*line
, *line0
, *line1
;
1545 int poolMinBaseIdx
, startBaseIdx
, minBaseIdx
, maxBaseIdx
;
1546 int baseIdx
, bestWordBaseIdx
, idx0
, idx1
;
1547 double minBase
, maxBase
;
1548 double fontSize
, wordSpacing
, delta
, priDelta
, secDelta
;
1549 TextLine
**lineArray
;
1550 GBool found
, overlap
;
1554 // discard duplicated text (fake boldface, drop shadows)
1555 for (idx0
= pool
->minBaseIdx
; idx0
<= pool
->maxBaseIdx
; ++idx0
) {
1556 word0
= pool
->getPool(idx0
);
1558 priDelta
= dupMaxPriDelta
* word0
->fontSize
;
1559 secDelta
= dupMaxSecDelta
* word0
->fontSize
;
1560 maxBaseIdx
= pool
->getBaseIdx(word0
->base
+ secDelta
);
1562 word1
= word2
= NULL
; // make gcc happy
1563 for (idx1
= idx0
; idx1
<= maxBaseIdx
; ++idx1
) {
1566 word2
= word0
->next
;
1569 word2
= pool
->getPool(idx1
);
1571 for (; word2
; word1
= word2
, word2
= word2
->next
) {
1572 if (word2
->len
== word0
->len
&&
1573 !memcmp(word2
->text
, word0
->text
,
1574 word0
->len
* sizeof(Unicode
))) {
1578 found
= fabs(word0
->xMin
- word2
->xMin
) < priDelta
&&
1579 fabs(word0
->xMax
- word2
->xMax
) < priDelta
&&
1580 fabs(word0
->yMin
- word2
->yMin
) < secDelta
&&
1581 fabs(word0
->yMax
- word2
->yMax
) < secDelta
;
1585 found
= fabs(word0
->xMin
- word2
->xMin
) < secDelta
&&
1586 fabs(word0
->xMax
- word2
->xMax
) < secDelta
&&
1587 fabs(word0
->yMin
- word2
->yMin
) < priDelta
&&
1588 fabs(word0
->yMax
- word2
->yMax
) < priDelta
;
1602 word1
->next
= word2
->next
;
1604 pool
->setPool(idx1
, word2
->next
);
1608 word0
= word0
->next
;
1615 poolMinBaseIdx
= pool
->minBaseIdx
;
1620 // find the first non-empty line in the pool
1622 poolMinBaseIdx
<= pool
->maxBaseIdx
&& !pool
->getPool(poolMinBaseIdx
);
1624 if (poolMinBaseIdx
> pool
->maxBaseIdx
) {
1628 // look for the left-most word in the first four lines of the
1629 // pool -- this avoids starting with a superscript word
1630 startBaseIdx
= poolMinBaseIdx
;
1631 for (baseIdx
= poolMinBaseIdx
+ 1;
1632 baseIdx
< poolMinBaseIdx
+ 4 && baseIdx
<= pool
->maxBaseIdx
;
1634 if (!pool
->getPool(baseIdx
)) {
1637 if (pool
->getPool(baseIdx
)->primaryCmp(pool
->getPool(startBaseIdx
))
1639 startBaseIdx
= baseIdx
;
1643 // create a new line
1644 word0
= pool
->getPool(startBaseIdx
);
1645 pool
->setPool(startBaseIdx
, word0
->next
);
1647 line
= new TextLine(this, word0
->rot
, word0
->base
);
1648 line
->addWord(word0
);
1651 // compute the search range
1652 fontSize
= word0
->fontSize
;
1653 minBase
= word0
->base
- maxIntraLineDelta
* fontSize
;
1654 maxBase
= word0
->base
+ maxIntraLineDelta
* fontSize
;
1655 minBaseIdx
= pool
->getBaseIdx(minBase
);
1656 maxBaseIdx
= pool
->getBaseIdx(maxBase
);
1657 wordSpacing
= fixedPitch
? fixedPitch
: maxWordSpacing
* fontSize
;
1659 // find the rest of the words in this line
1662 // find the left-most word whose baseline is in the range for
1664 bestWordBaseIdx
= 0;
1665 bestWord0
= bestWord1
= NULL
;
1667 for (baseIdx
= minBaseIdx
;
1668 !overlap
&& baseIdx
<= maxBaseIdx
;
1670 for (word0
= NULL
, word1
= pool
->getPool(baseIdx
);
1672 word0
= word1
, word1
= word1
->next
) {
1673 if (word1
->base
>= minBase
&&
1674 word1
->base
<= maxBase
) {
1675 delta
= lastWord
->primaryDelta(word1
);
1676 if (delta
< minCharSpacing
* fontSize
) {
1680 if (delta
< wordSpacing
&&
1681 (!bestWord1
|| word1
->primaryCmp(bestWord1
) < 0)) {
1682 bestWordBaseIdx
= baseIdx
;
1691 if (overlap
|| !bestWord1
) {
1695 // remove it from the pool, and add it to the line
1697 bestWord0
->next
= bestWord1
->next
;
1699 pool
->setPool(bestWordBaseIdx
, bestWord1
->next
);
1701 bestWord1
->next
= NULL
;
1702 line
->addWord(bestWord1
);
1703 lastWord
= bestWord1
;
1707 if (curLine
&& line
->cmpYX(curLine
) > 0) {
1709 line1
= curLine
->next
;
1715 line1
&& line
->cmpYX(line1
) > 0;
1716 line0
= line1
, line1
= line1
->next
) ;
1724 line
->coalesce(uMap
);
1725 charCount
+= line
->len
;
1729 // sort lines into xy order for column assignment
1730 lineArray
= (TextLine
**)gmallocn(nLines
, sizeof(TextLine
*));
1731 for (line
= lines
, i
= 0; line
; line
= line
->next
, ++i
) {
1732 lineArray
[i
] = line
;
1734 qsort(lineArray
, nLines
, sizeof(TextLine
*), &TextLine::cmpXY
);
1736 // column assignment
1739 for (i
= 0; i
< nLines
; ++i
) {
1740 line0
= lineArray
[i
];
1741 col1
= 0; // make gcc happy
1744 col1
= (int)((line0
->xMin
- xMin
) / fixedPitch
+ 0.5);
1747 col1
= (int)((line0
->yMin
- yMin
) / fixedPitch
+ 0.5);
1750 col1
= (int)((xMax
- line0
->xMax
) / fixedPitch
+ 0.5);
1753 col1
= (int)((yMax
- line0
->yMax
) / fixedPitch
+ 0.5);
1756 for (k
= 0; k
<= line0
->len
; ++k
) {
1757 line0
->col
[k
] += col1
;
1759 if (line0
->col
[line0
->len
] > nColumns
) {
1760 nColumns
= line0
->col
[line0
->len
];
1764 for (i
= 0; i
< nLines
; ++i
) {
1765 line0
= lineArray
[i
];
1767 for (j
= 0; j
< i
; ++j
) {
1768 line1
= lineArray
[j
];
1769 if (line1
->primaryDelta(line0
) >= 0) {
1770 col2
= line1
->col
[line1
->len
] + 1;
1772 k
= 0; // make gcc happy
1777 line0
->xMin
>= 0.5 * (line1
->edge
[k
] + line1
->edge
[k
+1]);
1783 line0
->yMin
>= 0.5 * (line1
->edge
[k
] + line1
->edge
[k
+1]);
1789 line0
->xMax
<= 0.5 * (line1
->edge
[k
] + line1
->edge
[k
+1]);
1795 line0
->yMax
<= 0.5 * (line1
->edge
[k
] + line1
->edge
[k
+1]);
1799 col2
= line1
->col
[k
];
1805 for (k
= 0; k
<= line0
->len
; ++k
) {
1806 line0
->col
[k
] += col1
;
1808 if (line0
->col
[line0
->len
] > nColumns
) {
1809 nColumns
= line0
->col
[line0
->len
];
1816 void TextBlock::updatePriMinMax(TextBlock
*blk
) {
1817 double newPriMin
, newPriMax
;
1818 GBool gotPriMin
, gotPriMax
;
1820 gotPriMin
= gotPriMax
= gFalse
;
1821 newPriMin
= newPriMax
= 0; // make gcc happy
1822 switch (page
->primaryRot
) {
1825 if (blk
->yMin
< yMax
&& blk
->yMax
> yMin
) {
1826 if (blk
->xMin
< xMin
) {
1827 newPriMin
= blk
->xMax
;
1830 if (blk
->xMax
> xMax
) {
1831 newPriMax
= blk
->xMin
;
1838 if (blk
->xMin
< xMax
&& blk
->xMax
> xMin
) {
1839 if (blk
->yMin
< yMin
) {
1840 newPriMin
= blk
->yMax
;
1843 if (blk
->yMax
> yMax
) {
1844 newPriMax
= blk
->yMin
;
1851 if (newPriMin
> xMin
) {
1854 if (newPriMin
> priMin
) {
1859 if (newPriMax
< xMax
) {
1862 if (newPriMax
< priMax
) {
1868 int TextBlock::cmpXYPrimaryRot(const void *p1
, const void *p2
) {
1869 TextBlock
*blk1
= *(TextBlock
**)p1
;
1870 TextBlock
*blk2
= *(TextBlock
**)p2
;
1873 cmp
= 0; // make gcc happy
1874 switch (blk1
->page
->primaryRot
) {
1876 if ((cmp
= blk1
->xMin
- blk2
->xMin
) == 0) {
1877 cmp
= blk1
->yMin
- blk2
->yMin
;
1881 if ((cmp
= blk1
->yMin
- blk2
->yMin
) == 0) {
1882 cmp
= blk2
->xMax
- blk1
->xMax
;
1886 if ((cmp
= blk2
->xMax
- blk1
->xMax
) == 0) {
1887 cmp
= blk2
->yMin
- blk1
->yMin
;
1891 if ((cmp
= blk2
->yMax
- blk1
->yMax
) == 0) {
1892 cmp
= blk1
->xMax
- blk2
->xMax
;
1896 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1899 int TextBlock::cmpYXPrimaryRot(const void *p1
, const void *p2
) {
1900 TextBlock
*blk1
= *(TextBlock
**)p1
;
1901 TextBlock
*blk2
= *(TextBlock
**)p2
;
1904 cmp
= 0; // make gcc happy
1905 switch (blk1
->page
->primaryRot
) {
1907 if ((cmp
= blk1
->yMin
- blk2
->yMin
) == 0) {
1908 cmp
= blk1
->xMin
- blk2
->xMin
;
1912 if ((cmp
= blk2
->xMax
- blk1
->xMax
) == 0) {
1913 cmp
= blk1
->yMin
- blk2
->yMin
;
1917 if ((cmp
= blk2
->yMin
- blk1
->yMin
) == 0) {
1918 cmp
= blk2
->xMax
- blk1
->xMax
;
1922 if ((cmp
= blk1
->xMax
- blk2
->xMax
) == 0) {
1923 cmp
= blk2
->yMax
- blk1
->yMax
;
1927 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1930 int TextBlock::primaryCmp(TextBlock
*blk
) {
1933 cmp
= 0; // make gcc happy
1936 cmp
= xMin
- blk
->xMin
;
1939 cmp
= yMin
- blk
->yMin
;
1942 cmp
= blk
->xMax
- xMax
;
1945 cmp
= blk
->yMax
- yMax
;
1948 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1951 double TextBlock::secondaryDelta(TextBlock
*blk
) {
1954 delta
= 0; // make gcc happy
1957 delta
= blk
->yMin
- yMax
;
1960 delta
= xMin
- blk
->xMax
;
1963 delta
= yMin
- blk
->yMax
;
1966 delta
= blk
->xMin
- xMax
;
1972 GBool
TextBlock::isBelow(TextBlock
*blk
) {
1975 below
= gFalse
; // make gcc happy
1976 switch (page
->primaryRot
) {
1978 below
= xMin
>= blk
->priMin
&& xMax
<= blk
->priMax
&&
1982 below
= yMin
>= blk
->priMin
&& yMax
<= blk
->priMax
&&
1986 below
= xMin
>= blk
->priMin
&& xMax
<= blk
->priMax
&&
1990 below
= yMin
>= blk
->priMin
&& yMax
<= blk
->priMax
&&
1998 GBool
TextBlock::isBeforeByRule1(TextBlock
*blk1
) {
1999 GBool before
= gFalse
;
2000 GBool overlap
= gFalse
;
2002 switch (this->page
->primaryRot
) {
2005 overlap
= ((this->ExMin
<= blk1
->ExMin
) &&
2006 (blk1
->ExMin
<= this->ExMax
)) ||
2007 ((blk1
->ExMin
<= this->ExMin
) &&
2008 (this->ExMin
<= blk1
->ExMax
));
2012 overlap
= ((this->EyMin
<= blk1
->EyMin
) &&
2013 (blk1
->EyMin
<= this->EyMax
)) ||
2014 ((blk1
->EyMin
<= this->EyMin
) &&
2015 (this->EyMin
<= blk1
->EyMax
));
2018 switch (this->page
->primaryRot
) {
2020 before
= overlap
&& this->EyMin
< blk1
->EyMin
;
2023 before
= overlap
&& this->ExMax
> blk1
->ExMax
;
2026 before
= overlap
&& this->EyMax
> blk1
->EyMax
;
2029 before
= overlap
&& this->ExMin
< blk1
->ExMin
;
2035 GBool
TextBlock::isBeforeByRule2(TextBlock
*blk1
) {
2039 if (!page
->primaryLR
) {
2040 rotLR
= (rotLR
+ 2) % 4;
2045 cmp
= ExMax
- blk1
->ExMin
;
2048 cmp
= EyMin
- blk1
->EyMax
;
2051 cmp
= blk1
->ExMax
- ExMin
;
2054 cmp
= blk1
->EyMin
- EyMax
;
2060 // Sort into reading order by performing a topological sort using the rules
2061 // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
2062 // See http://pubs.iupr.org/#2003-breuel-sdiut
2063 // Topological sort is done by depth first search, see
2064 // http://en.wikipedia.org/wiki/Topological_sorting
2065 int TextBlock::visitDepthFirst(TextBlock
*blkList
, int pos1
,
2066 TextBlock
**sorted
, int sortPos
,
2069 TextBlock
*blk1
, *blk2
, *blk3
;
2072 if (visited
[pos1
]) {
2078 #if 0 // for debugging
2079 printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
2080 sortPos
, blk1
->ExMin
, blk1
->ExMax
, blk1
->EyMin
, blk1
->EyMax
);
2082 visited
[pos1
] = gTrue
;
2084 for (blk2
= blkList
; blk2
; blk2
= blk2
->next
) {
2086 if (visited
[pos2
]) {
2087 // skip visited nodes
2092 // is blk2 before blk1? (for table entries)
2093 if (blk1
->tableId
>= 0 && blk1
->tableId
== blk2
->tableId
) {
2094 if (page
->primaryLR
) {
2095 if (blk2
->xMax
<= blk1
->xMin
&&
2096 blk2
->yMin
<= blk1
->yMax
&&
2097 blk2
->yMax
>= blk1
->yMin
)
2100 if (blk2
->xMin
>= blk1
->xMax
&&
2101 blk2
->yMin
<= blk1
->yMax
&&
2102 blk2
->yMax
>= blk1
->yMin
)
2106 if (blk2
->yMax
<= blk1
->yMin
)
2109 if (blk2
->isBeforeByRule1(blk1
)) {
2110 // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
2112 #if 0 // for debugging
2113 printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2114 blk2
->ExMin
, blk2
->ExMax
, blk2
->EyMin
, blk2
->EyMax
,
2115 blk1
->ExMin
, blk1
->ExMax
, blk1
->EyMin
, blk1
->EyMax
);
2117 } else if (blk2
->isBeforeByRule2(blk1
)) {
2118 // Rule (2) blk2 left of blk1, and no intervening blk3
2119 // such that blk1 is before blk3 by rule 1,
2120 // and blk3 is before blk2 by rule 1.
2122 for (blk3
= blkList
; blk3
; blk3
= blk3
->next
) {
2123 if (blk3
== blk2
|| blk3
== blk1
) {
2126 if (blk1
->isBeforeByRule1(blk3
) &&
2127 blk3
->isBeforeByRule1(blk2
)) {
2132 #if 0 // for debugging
2134 printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2135 blk1
->ExMin
, blk1
->ExMax
, blk1
->EyMin
, blk1
->EyMax
,
2136 blk2
->ExMin
, blk2
->ExMax
, blk2
->EyMin
, blk2
->EyMax
);
2142 // blk2 is before blk1, so it needs to be visited
2143 // before we can add blk1 to the sorted list.
2144 sortPos
= blk2
->visitDepthFirst(blkList
, pos2
, sorted
, sortPos
, visited
);
2147 #if 0 // for debugging
2148 printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
2149 sortPos
, blk1
->ExMin
, blk1
->ExMax
, blk1
->EyMin
, blk1
->EyMax
);
2151 sorted
[sortPos
++] = blk1
;
2155 //------------------------------------------------------------------------
2157 //------------------------------------------------------------------------
2159 TextFlow::TextFlow(TextPage
*pageA
, TextBlock
*blk
) {
2165 priMin
= blk
->priMin
;
2166 priMax
= blk
->priMax
;
2167 blocks
= lastBlk
= blk
;
2171 TextFlow::~TextFlow() {
2176 blocks
= blocks
->next
;
2181 void TextFlow::addBlock(TextBlock
*blk
) {
2183 lastBlk
->next
= blk
;
2188 if (blk
->xMin
< xMin
) {
2191 if (blk
->xMax
> xMax
) {
2194 if (blk
->yMin
< yMin
) {
2197 if (blk
->yMax
> yMax
) {
2202 GBool
TextFlow::blockFits(TextBlock
*blk
, TextBlock
*prevBlk
) {
2205 // lower blocks must use smaller fonts
2206 if (blk
->lines
->words
->fontSize
> lastBlk
->lines
->words
->fontSize
) {
2210 fits
= gFalse
; // make gcc happy
2211 switch (page
->primaryRot
) {
2213 fits
= blk
->xMin
>= priMin
&& blk
->xMax
<= priMax
;
2216 fits
= blk
->yMin
>= priMin
&& blk
->yMax
<= priMax
;
2219 fits
= blk
->xMin
>= priMin
&& blk
->xMax
<= priMax
;
2222 fits
= blk
->yMin
>= priMin
&& blk
->yMax
<= priMax
;
2228 #if TEXTOUT_WORD_LIST
2230 //------------------------------------------------------------------------
2232 //------------------------------------------------------------------------
2234 TextWordList::TextWordList(TextPage
*text
, GBool physLayout
) {
2239 TextWord
**wordArray
;
2242 words
= new GooList();
2244 if (text
->rawOrder
) {
2245 for (word
= text
->rawWords
; word
; word
= word
->next
) {
2246 words
->append(word
);
2249 } else if (physLayout
) {
2250 // this is inefficient, but it's also the least useful of these
2253 for (flow
= text
->flows
; flow
; flow
= flow
->next
) {
2254 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
2255 for (line
= blk
->lines
; line
; line
= line
->next
) {
2256 for (word
= line
->words
; word
; word
= word
->next
) {
2262 wordArray
= (TextWord
**)gmallocn(nWords
, sizeof(TextWord
*));
2264 for (flow
= text
->flows
; flow
; flow
= flow
->next
) {
2265 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
2266 for (line
= blk
->lines
; line
; line
= line
->next
) {
2267 for (word
= line
->words
; word
; word
= word
->next
) {
2268 wordArray
[i
++] = word
;
2273 qsort(wordArray
, nWords
, sizeof(TextWord
*), &TextWord::cmpYX
);
2274 for (i
= 0; i
< nWords
; ++i
) {
2275 words
->append(wordArray
[i
]);
2280 for (flow
= text
->flows
; flow
; flow
= flow
->next
) {
2281 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
2282 for (line
= blk
->lines
; line
; line
= line
->next
) {
2283 for (word
= line
->words
; word
; word
= word
->next
) {
2284 words
->append(word
);
2292 TextWordList::~TextWordList() {
2296 int TextWordList::getLength() {
2297 return words
->getLength();
2300 TextWord
*TextWordList::get(int idx
) {
2301 if (idx
< 0 || idx
>= words
->getLength()) {
2304 return (TextWord
*)words
->get(idx
);
2307 #endif // TEXTOUT_WORD_LIST
2309 //------------------------------------------------------------------------
2311 //------------------------------------------------------------------------
2313 TextPage::TextPage(GBool rawOrderA
) {
2317 rawOrder
= rawOrderA
;
2324 lastCharOverlap
= gFalse
;
2326 for (rot
= 0; rot
< 4; ++rot
) {
2327 pools
[rot
] = new TextPool();
2334 fonts
= new GooList();
2335 lastFindXMin
= lastFindYMin
= 0;
2336 haveLastFind
= gFalse
;
2337 underlines
= new GooList();
2338 links
= new GooList();
2339 mergeCombining
= gTrue
;
2342 TextPage::~TextPage() {
2347 for (rot
= 0; rot
< 4; ++rot
) {
2352 deleteGooList(underlines
, TextUnderline
);
2353 deleteGooList(links
, TextLink
);
2356 void TextPage::incRefCnt() {
2360 void TextPage::decRefCnt() {
2365 void TextPage::startPage(GfxState
*state
) {
2368 pageWidth
= state
->getPageWidth();
2369 pageHeight
= state
->getPageHeight();
2371 pageWidth
= pageHeight
= 0;
2375 void TextPage::endPage() {
2381 void TextPage::clear() {
2393 rawWords
= rawWords
->next
;
2397 for (rot
= 0; rot
< 4; ++rot
) {
2402 flows
= flows
->next
;
2407 deleteGooList(fonts
, TextFontInfo
);
2408 deleteGooList(underlines
, TextUnderline
);
2409 deleteGooList(links
, TextLink
);
2418 for (rot
= 0; rot
< 4; ++rot
) {
2419 pools
[rot
] = new TextPool();
2426 fonts
= new GooList();
2427 underlines
= new GooList();
2428 links
= new GooList();
2431 void TextPage::updateFont(GfxState
*state
) {
2435 int code
, mCode
, letterCode
, anyCode
;
2439 // get the font info object
2441 for (i
= 0; i
< fonts
->getLength(); ++i
) {
2442 curFont
= (TextFontInfo
*)fonts
->get(i
);
2443 if (curFont
->matches(state
)) {
2449 curFont
= new TextFontInfo(state
);
2450 fonts
->append(curFont
);
2453 // adjust the font size
2454 gfxFont
= state
->getFont();
2455 curFontSize
= state
->getTransformedFontSize();
2456 if (gfxFont
&& gfxFont
->getType() == fontType3
) {
2457 // This is a hack which makes it possible to deal with some Type 3
2458 // fonts. The problem is that it's impossible to know what the
2459 // base coordinate system used in the font is without actually
2460 // rendering the font. This code tries to guess by looking at the
2461 // width of the character 'm' (which breaks if the font is a
2462 // subset that doesn't contain 'm').
2463 mCode
= letterCode
= anyCode
= -1;
2464 for (code
= 0; code
< 256; ++code
) {
2465 name
= ((Gfx8BitFont
*)gfxFont
)->getCharName(code
);
2466 int nameLen
= name
? strlen(name
) : 0;
2467 GBool nameOneChar
= nameLen
== 1 || (nameLen
> 1 && name
[1] == '\0');
2468 if (nameOneChar
&& name
[0] == 'm') {
2471 if (letterCode
< 0 && nameOneChar
&&
2472 ((name
[0] >= 'A' && name
[0] <= 'Z') ||
2473 (name
[0] >= 'a' && name
[0] <= 'z'))) {
2476 if (anyCode
< 0 && name
&&
2477 ((Gfx8BitFont
*)gfxFont
)->getWidth(code
) > 0) {
2482 (w
= ((Gfx8BitFont
*)gfxFont
)->getWidth(mCode
)) > 0) {
2483 // 0.6 is a generic average 'm' width -- yes, this is a hack
2484 curFontSize
*= w
/ 0.6;
2485 } else if (letterCode
>= 0 &&
2486 (w
= ((Gfx8BitFont
*)gfxFont
)->getWidth(letterCode
)) > 0) {
2487 // even more of a hack: 0.5 is a generic letter width
2488 curFontSize
*= w
/ 0.5;
2489 } else if (anyCode
>= 0 &&
2490 (w
= ((Gfx8BitFont
*)gfxFont
)->getWidth(anyCode
)) > 0) {
2491 // better than nothing: 0.5 is a generic character width
2492 curFontSize
*= w
/ 0.5;
2494 fm
= gfxFont
->getFontMatrix();
2496 curFontSize
*= fabs(fm
[3] / fm
[0]);
2501 void TextPage::beginWord(GfxState
*state
) {
2507 // This check is needed because Type 3 characters can contain
2508 // text-drawing operations (when TextPage is being used via
2509 // {X,Win}SplashOutputDev rather than TextOutputDev).
2515 // compute the rotation
2516 state
->getFontTransMat(&m
[0], &m
[1], &m
[2], &m
[3]);
2517 gfxFont
= state
->getFont();
2518 if (gfxFont
&& gfxFont
->getType() == fontType3
) {
2519 fontm
= state
->getFont()->getFontMatrix();
2520 m2
[0] = fontm
[0] * m
[0] + fontm
[1] * m
[2];
2521 m2
[1] = fontm
[0] * m
[1] + fontm
[1] * m
[3];
2522 m2
[2] = fontm
[2] * m
[0] + fontm
[3] * m
[2];
2523 m2
[3] = fontm
[2] * m
[1] + fontm
[3] * m
[3];
2529 if (fabs(m
[0] * m
[3]) > fabs(m
[1] * m
[2])) {
2530 rot
= (m
[0] > 0 || m
[3] < 0) ? 0 : 2;
2532 rot
= (m
[2] > 0) ? 1 : 3;
2535 // for vertical writing mode, the lines are effectively rotated 90
2537 if (gfxFont
&& gfxFont
->getWMode()) {
2538 rot
= (rot
+ 1) & 3;
2541 curWord
= new TextWord(state
, rot
, curFontSize
);
2544 void TextPage::addChar(GfxState
*state
, double x
, double y
,
2545 double dx
, double dy
,
2546 CharCode c
, int nBytes
, Unicode
*u
, int uLen
) {
2547 double x1
, y1
, w1
, h1
, dx2
, dy2
, base
, sp
, delta
;
2553 // subtract char and word spacing from the dx,dy values
2554 sp
= state
->getCharSpace();
2555 if (c
== (CharCode
)0x20) {
2556 sp
+= state
->getWordSpace();
2558 state
->textTransformDelta(sp
* state
->getHorizScaling(), 0, &dx2
, &dy2
);
2561 state
->transformDelta(dx
, dy
, &w1
, &h1
);
2563 // throw away chars that aren't inside the page bounds
2564 // (and also do a sanity check on the character size)
2565 state
->transform(x
, y
, &x1
, &y1
);
2566 if (x1
+ w1
< 0 || x1
> pageWidth
||
2567 y1
+ h1
< 0 || y1
> pageHeight
||
2568 x1
!= x1
|| y1
!= y1
|| // IEEE way of checking for isnan
2569 w1
!= w1
|| h1
!= h1
) {
2574 // check the tiny chars limit
2575 if (!globalParams
->getTextKeepTinyChars() &&
2576 fabs(w1
) < 3 && fabs(h1
) < 3) {
2577 if (++nTinyChars
> 50000) {
2583 // break words at space character
2584 if (uLen
== 1 && u
[0] == (Unicode
)0x20) {
2590 state
->getFontTransMat(&mat
.m
[0], &mat
.m
[1], &mat
.m
[2], &mat
.m
[3]);
2594 if (mergeCombining
&& curWord
&& uLen
== 1 &&
2595 curWord
->addCombining(state
, curFont
, curFontSize
, x1
, y1
, w1
, h1
, charPos
, nBytes
, c
,
2601 // start a new word if:
2602 // (1) this character doesn't fall in the right place relative to
2603 // the end of the previous word (this places upper and lower
2604 // constraints on the position deltas along both the primary
2605 // and secondary axes), or
2606 // (2) this character overlaps the previous one (duplicated text), or
2607 // (3) the previous character was an overlap (we want each duplicated
2608 // character to be in a word by itself at this stage),
2609 // (4) the font size has changed
2610 // (5) the WMode changed
2611 if (curWord
&& curWord
->len
> 0) {
2612 base
= sp
= delta
= 0; // make gcc happy
2613 switch (curWord
->rot
) {
2616 sp
= x1
- curWord
->xMax
;
2617 delta
= x1
- curWord
->edge
[curWord
->len
- 1];
2621 sp
= y1
- curWord
->yMax
;
2622 delta
= y1
- curWord
->edge
[curWord
->len
- 1];
2626 sp
= curWord
->xMin
- x1
;
2627 delta
= curWord
->edge
[curWord
->len
- 1] - x1
;
2631 sp
= curWord
->yMin
- y1
;
2632 delta
= curWord
->edge
[curWord
->len
- 1] - y1
;
2635 overlap
= fabs(delta
) < dupMaxPriDelta
* curWord
->fontSize
&&
2636 fabs(base
- curWord
->base
) < dupMaxSecDelta
* curWord
->fontSize
;
2637 wMode
= curFont
->getWMode();
2638 if (overlap
|| lastCharOverlap
||
2639 sp
< -minDupBreakOverlap
* curWord
->fontSize
||
2640 sp
> minWordBreakSpace
* curWord
->fontSize
||
2641 fabs(base
- curWord
->base
) > 0.5 ||
2642 curFontSize
!= curWord
->fontSize
||
2643 wMode
!= curWord
->wMode
) {
2646 lastCharOverlap
= overlap
;
2648 lastCharOverlap
= gFalse
;
2652 // start a new word if needed
2657 // page rotation and/or transform matrices can cause text to be
2658 // drawn in reverse order -- in this case, swap the begin/end
2659 // coordinates and break text into individual chars
2660 if ((curWord
->rot
== 0 && w1
< 0) ||
2661 (curWord
->rot
== 1 && h1
< 0) ||
2662 (curWord
->rot
== 2 && w1
> 0) ||
2663 (curWord
->rot
== 3 && h1
> 0)) {
2672 // add the characters to the current word
2675 for (i
= 0; i
< uLen
; ++i
) {
2676 curWord
->addChar(state
, curFont
, x1
+ i
*w1
, y1
+ i
*h1
, w1
, h1
, charPos
, nBytes
, c
, u
[i
], mat
);
2682 void TextPage::incCharCount(int nChars
) {
2686 void TextPage::endWord() {
2687 // This check is needed because Type 3 characters can contain
2688 // text-drawing operations (when TextPage is being used via
2689 // {X,Win}SplashOutputDev rather than TextOutputDev).
2701 void TextPage::addWord(TextWord
*word
) {
2702 // throw away zero-length words -- they don't have valid xMin/xMax
2703 // values, and they're useless anyway
2704 if (word
->len
== 0) {
2711 rawLastWord
->next
= word
;
2717 pools
[word
->rot
]->addWord(word
);
2721 void TextPage::addUnderline(double x0
, double y0
, double x1
, double y1
) {
2722 underlines
->append(new TextUnderline(x0
, y0
, x1
, y1
));
2725 void TextPage::addLink(int xMin
, int yMin
, int xMax
, int yMax
, AnnotLink
*link
) {
2726 links
->append(new TextLink(xMin
, yMin
, xMax
, yMax
, link
));
2729 void TextPage::coalesce(GBool physLayout
, double fixedPitch
, GBool doHTML
) {
2732 TextWord
*word0
, *word1
, *word2
;
2734 TextBlock
*blkList
, *blk
, *lastBlk
, *blk0
, *blk1
, *blk2
;
2735 TextFlow
*flow
, *lastFlow
;
2736 TextUnderline
*underline
;
2738 int rot
, poolMinBaseIdx
, baseIdx
, startBaseIdx
, endBaseIdx
;
2739 double minBase
, maxBase
, newMinBase
, newMaxBase
;
2740 double fontSize
, colSpace1
, colSpace2
, lineSpace
, intraLineSpace
, blkSpace
;
2753 uMap
= globalParams
->getTextEncoding();
2759 #if 0 // for debugging
2760 printf("*** initial words ***\n");
2761 for (rot
= 0; rot
< 4; ++rot
) {
2763 for (baseIdx
= pool
->minBaseIdx
; baseIdx
<= pool
->maxBaseIdx
; ++baseIdx
) {
2764 for (word0
= pool
->getPool(baseIdx
); word0
; word0
= word0
->next
) {
2765 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2766 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
2767 word0
->base
, word0
->fontSize
, rot
*90, word0
->link
);
2768 for (i
= 0; i
< word0
->len
; ++i
) {
2769 fputc(word0
->text
[i
] & 0xff, stdout
);
2778 #if 0 //~ for debugging
2779 for (i
= 0; i
< underlines
->getLength(); ++i
) {
2780 underline
= (TextUnderline
*)underlines
->get(i
);
2781 printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2782 underline
->x0
, underline
->x1
, underline
->y0
, underline
->y1
,
2789 //----- handle underlining
2790 for (i
= 0; i
< underlines
->getLength(); ++i
) {
2791 underline
= (TextUnderline
*)underlines
->get(i
);
2792 if (underline
->horiz
) {
2794 if (pools
[0]->minBaseIdx
<= pools
[0]->maxBaseIdx
) {
2795 startBaseIdx
= pools
[0]->getBaseIdx(underline
->y0
+ minUnderlineGap
);
2796 endBaseIdx
= pools
[0]->getBaseIdx(underline
->y0
+ maxUnderlineGap
);
2797 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2798 for (word0
= pools
[0]->getPool(j
); word0
; word0
= word0
->next
) {
2799 //~ need to check the y value against the word baseline
2800 if (underline
->x0
< word0
->xMin
+ underlineSlack
&&
2801 word0
->xMax
- underlineSlack
< underline
->x1
) {
2802 word0
->underlined
= gTrue
;
2809 if (pools
[2]->minBaseIdx
<= pools
[2]->maxBaseIdx
) {
2810 startBaseIdx
= pools
[2]->getBaseIdx(underline
->y0
- maxUnderlineGap
);
2811 endBaseIdx
= pools
[2]->getBaseIdx(underline
->y0
- minUnderlineGap
);
2812 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2813 for (word0
= pools
[2]->getPool(j
); word0
; word0
= word0
->next
) {
2814 if (underline
->x0
< word0
->xMin
+ underlineSlack
&&
2815 word0
->xMax
- underlineSlack
< underline
->x1
) {
2816 word0
->underlined
= gTrue
;
2823 if (pools
[1]->minBaseIdx
<= pools
[1]->maxBaseIdx
) {
2824 startBaseIdx
= pools
[1]->getBaseIdx(underline
->x0
- maxUnderlineGap
);
2825 endBaseIdx
= pools
[1]->getBaseIdx(underline
->x0
- minUnderlineGap
);
2826 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2827 for (word0
= pools
[1]->getPool(j
); word0
; word0
= word0
->next
) {
2828 if (underline
->y0
< word0
->yMin
+ underlineSlack
&&
2829 word0
->yMax
- underlineSlack
< underline
->y1
) {
2830 word0
->underlined
= gTrue
;
2837 if (pools
[3]->minBaseIdx
<= pools
[3]->maxBaseIdx
) {
2838 startBaseIdx
= pools
[3]->getBaseIdx(underline
->x0
+ minUnderlineGap
);
2839 endBaseIdx
= pools
[3]->getBaseIdx(underline
->x0
+ maxUnderlineGap
);
2840 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2841 for (word0
= pools
[3]->getPool(j
); word0
; word0
= word0
->next
) {
2842 if (underline
->y0
< word0
->yMin
+ underlineSlack
&&
2843 word0
->yMax
- underlineSlack
< underline
->y1
) {
2844 word0
->underlined
= gTrue
;
2852 //----- handle links
2853 for (i
= 0; i
< links
->getLength(); ++i
) {
2854 link
= (TextLink
*)links
->get(i
);
2857 if (pools
[0]->minBaseIdx
<= pools
[0]->maxBaseIdx
) {
2858 startBaseIdx
= pools
[0]->getBaseIdx(link
->yMin
);
2859 endBaseIdx
= pools
[0]->getBaseIdx(link
->yMax
);
2860 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2861 for (word0
= pools
[0]->getPool(j
); word0
; word0
= word0
->next
) {
2862 if (link
->xMin
< word0
->xMin
+ hyperlinkSlack
&&
2863 word0
->xMax
- hyperlinkSlack
< link
->xMax
&&
2864 link
->yMin
< word0
->yMin
+ hyperlinkSlack
&&
2865 word0
->yMax
- hyperlinkSlack
< link
->yMax
) {
2866 word0
->link
= link
->link
;
2873 if (pools
[2]->minBaseIdx
<= pools
[2]->maxBaseIdx
) {
2874 startBaseIdx
= pools
[2]->getBaseIdx(link
->yMin
);
2875 endBaseIdx
= pools
[2]->getBaseIdx(link
->yMax
);
2876 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2877 for (word0
= pools
[2]->getPool(j
); word0
; word0
= word0
->next
) {
2878 if (link
->xMin
< word0
->xMin
+ hyperlinkSlack
&&
2879 word0
->xMax
- hyperlinkSlack
< link
->xMax
&&
2880 link
->yMin
< word0
->yMin
+ hyperlinkSlack
&&
2881 word0
->yMax
- hyperlinkSlack
< link
->yMax
) {
2882 word0
->link
= link
->link
;
2889 if (pools
[1]->minBaseIdx
<= pools
[1]->maxBaseIdx
) {
2890 startBaseIdx
= pools
[1]->getBaseIdx(link
->xMin
);
2891 endBaseIdx
= pools
[1]->getBaseIdx(link
->xMax
);
2892 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2893 for (word0
= pools
[1]->getPool(j
); word0
; word0
= word0
->next
) {
2894 if (link
->yMin
< word0
->yMin
+ hyperlinkSlack
&&
2895 word0
->yMax
- hyperlinkSlack
< link
->yMax
&&
2896 link
->xMin
< word0
->xMin
+ hyperlinkSlack
&&
2897 word0
->xMax
- hyperlinkSlack
< link
->xMax
) {
2898 word0
->link
= link
->link
;
2905 if (pools
[3]->minBaseIdx
<= pools
[3]->maxBaseIdx
) {
2906 startBaseIdx
= pools
[3]->getBaseIdx(link
->xMin
);
2907 endBaseIdx
= pools
[3]->getBaseIdx(link
->xMax
);
2908 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2909 for (word0
= pools
[3]->getPool(j
); word0
; word0
= word0
->next
) {
2910 if (link
->yMin
< word0
->yMin
+ hyperlinkSlack
&&
2911 word0
->yMax
- hyperlinkSlack
< link
->yMax
&&
2912 link
->xMin
< word0
->xMin
+ hyperlinkSlack
&&
2913 word0
->xMax
- hyperlinkSlack
< link
->xMax
) {
2914 word0
->link
= link
->link
;
2922 //----- assemble the blocks
2924 //~ add an outer loop for writing mode (vertical text)
2926 // build blocks for each rotation value
2927 for (rot
= 0; rot
< 4; ++rot
) {
2929 poolMinBaseIdx
= pool
->minBaseIdx
;
2932 // add blocks until no more words are left
2935 // find the first non-empty line in the pool
2937 poolMinBaseIdx
<= pool
->maxBaseIdx
&&
2938 !pool
->getPool(poolMinBaseIdx
);
2940 if (poolMinBaseIdx
> pool
->maxBaseIdx
) {
2944 // look for the left-most word in the first four lines of the
2945 // pool -- this avoids starting with a superscript word
2946 startBaseIdx
= poolMinBaseIdx
;
2947 for (baseIdx
= poolMinBaseIdx
+ 1;
2948 baseIdx
< poolMinBaseIdx
+ 4 && baseIdx
<= pool
->maxBaseIdx
;
2950 if (!pool
->getPool(baseIdx
)) {
2953 if (pool
->getPool(baseIdx
)->primaryCmp(pool
->getPool(startBaseIdx
))
2955 startBaseIdx
= baseIdx
;
2959 // create a new block
2960 word0
= pool
->getPool(startBaseIdx
);
2961 pool
->setPool(startBaseIdx
, word0
->next
);
2963 blk
= new TextBlock(this, rot
);
2964 blk
->addWord(word0
);
2966 fontSize
= word0
->fontSize
;
2967 minBase
= maxBase
= word0
->base
;
2968 colSpace1
= minColSpacing1
* fontSize
;
2969 colSpace2
= minColSpacing2
* fontSize
;
2970 lineSpace
= maxLineSpacingDelta
* fontSize
;
2971 intraLineSpace
= maxIntraLineDelta
* fontSize
;
2973 // add words to the block
2977 // look for words on the line above the current top edge of
2979 newMinBase
= minBase
;
2980 for (baseIdx
= pool
->getBaseIdx(minBase
);
2981 baseIdx
>= pool
->getBaseIdx(minBase
- lineSpace
);
2984 word1
= pool
->getPool(baseIdx
);
2986 if (word1
->base
< minBase
&&
2987 word1
->base
>= minBase
- lineSpace
&&
2988 ((rot
== 0 || rot
== 2)
2989 ? (word1
->xMin
< blk
->xMax
&& word1
->xMax
> blk
->xMin
)
2990 : (word1
->yMin
< blk
->yMax
&& word1
->yMax
> blk
->yMin
)) &&
2991 fabs(word1
->fontSize
- fontSize
) <
2992 maxBlockFontSizeDelta1
* fontSize
) {
2995 word0
->next
= word1
->next
;
2997 pool
->setPool(baseIdx
, word1
->next
);
2999 word1
= word1
->next
;
3001 blk
->addWord(word2
);
3003 newMinBase
= word2
->base
;
3006 word1
= word1
->next
;
3010 minBase
= newMinBase
;
3012 // look for words on the line below the current bottom edge of
3014 newMaxBase
= maxBase
;
3015 for (baseIdx
= pool
->getBaseIdx(maxBase
);
3016 baseIdx
<= pool
->getBaseIdx(maxBase
+ lineSpace
);
3019 word1
= pool
->getPool(baseIdx
);
3021 if (word1
->base
> maxBase
&&
3022 word1
->base
<= maxBase
+ lineSpace
&&
3023 ((rot
== 0 || rot
== 2)
3024 ? (word1
->xMin
< blk
->xMax
&& word1
->xMax
> blk
->xMin
)
3025 : (word1
->yMin
< blk
->yMax
&& word1
->yMax
> blk
->yMin
)) &&
3026 fabs(word1
->fontSize
- fontSize
) <
3027 maxBlockFontSizeDelta1
* fontSize
) {
3030 word0
->next
= word1
->next
;
3032 pool
->setPool(baseIdx
, word1
->next
);
3034 word1
= word1
->next
;
3036 blk
->addWord(word2
);
3038 newMaxBase
= word2
->base
;
3041 word1
= word1
->next
;
3045 maxBase
= newMaxBase
;
3047 // look for words that are on lines already in the block, and
3048 // that overlap the block horizontally
3049 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
3050 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
3053 word1
= pool
->getPool(baseIdx
);
3055 if (word1
->base
>= minBase
- intraLineSpace
&&
3056 word1
->base
<= maxBase
+ intraLineSpace
&&
3057 ((rot
== 0 || rot
== 2)
3058 ? (word1
->xMin
< blk
->xMax
+ colSpace1
&&
3059 word1
->xMax
> blk
->xMin
- colSpace1
)
3060 : (word1
->yMin
< blk
->yMax
+ colSpace1
&&
3061 word1
->yMax
> blk
->yMin
- colSpace1
)) &&
3062 fabs(word1
->fontSize
- fontSize
) <
3063 maxBlockFontSizeDelta2
* fontSize
) {
3066 word0
->next
= word1
->next
;
3068 pool
->setPool(baseIdx
, word1
->next
);
3070 word1
= word1
->next
;
3072 blk
->addWord(word2
);
3076 word1
= word1
->next
;
3081 // only check for outlying words (the next two chunks of code)
3082 // if we didn't find anything else
3087 // scan down the left side of the block, looking for words
3088 // that are near (but not overlapping) the block; if there are
3089 // three or fewer, add them to the block
3091 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
3092 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
3094 word1
= pool
->getPool(baseIdx
);
3096 if (word1
->base
>= minBase
- intraLineSpace
&&
3097 word1
->base
<= maxBase
+ intraLineSpace
&&
3098 ((rot
== 0 || rot
== 2)
3099 ? (word1
->xMax
<= blk
->xMin
&&
3100 word1
->xMax
> blk
->xMin
- colSpace2
)
3101 : (word1
->yMax
<= blk
->yMin
&&
3102 word1
->yMax
> blk
->yMin
- colSpace2
)) &&
3103 fabs(word1
->fontSize
- fontSize
) <
3104 maxBlockFontSizeDelta3
* fontSize
) {
3108 word1
= word1
->next
;
3111 if (n
> 0 && n
<= 3) {
3112 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
3113 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
3116 word1
= pool
->getPool(baseIdx
);
3118 if (word1
->base
>= minBase
- intraLineSpace
&&
3119 word1
->base
<= maxBase
+ intraLineSpace
&&
3120 ((rot
== 0 || rot
== 2)
3121 ? (word1
->xMax
<= blk
->xMin
&&
3122 word1
->xMax
> blk
->xMin
- colSpace2
)
3123 : (word1
->yMax
<= blk
->yMin
&&
3124 word1
->yMax
> blk
->yMin
- colSpace2
)) &&
3125 fabs(word1
->fontSize
- fontSize
) <
3126 maxBlockFontSizeDelta3
* fontSize
) {
3129 word0
->next
= word1
->next
;
3131 pool
->setPool(baseIdx
, word1
->next
);
3133 word1
= word1
->next
;
3135 blk
->addWord(word2
);
3136 if (word2
->base
< minBase
) {
3137 minBase
= word2
->base
;
3138 } else if (word2
->base
> maxBase
) {
3139 maxBase
= word2
->base
;
3145 word1
= word1
->next
;
3151 // scan down the right side of the block, looking for words
3152 // that are near (but not overlapping) the block; if there are
3153 // three or fewer, add them to the block
3155 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
3156 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
3158 word1
= pool
->getPool(baseIdx
);
3160 if (word1
->base
>= minBase
- intraLineSpace
&&
3161 word1
->base
<= maxBase
+ intraLineSpace
&&
3162 ((rot
== 0 || rot
== 2)
3163 ? (word1
->xMin
>= blk
->xMax
&&
3164 word1
->xMin
< blk
->xMax
+ colSpace2
)
3165 : (word1
->yMin
>= blk
->yMax
&&
3166 word1
->yMin
< blk
->yMax
+ colSpace2
)) &&
3167 fabs(word1
->fontSize
- fontSize
) <
3168 maxBlockFontSizeDelta3
* fontSize
) {
3172 word1
= word1
->next
;
3175 if (n
> 0 && n
<= 3) {
3176 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
3177 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
3180 word1
= pool
->getPool(baseIdx
);
3182 if (word1
->base
>= minBase
- intraLineSpace
&&
3183 word1
->base
<= maxBase
+ intraLineSpace
&&
3184 ((rot
== 0 || rot
== 2)
3185 ? (word1
->xMin
>= blk
->xMax
&&
3186 word1
->xMin
< blk
->xMax
+ colSpace2
)
3187 : (word1
->yMin
>= blk
->yMax
&&
3188 word1
->yMin
< blk
->yMax
+ colSpace2
)) &&
3189 fabs(word1
->fontSize
- fontSize
) <
3190 maxBlockFontSizeDelta3
* fontSize
) {
3193 word0
->next
= word1
->next
;
3195 pool
->setPool(baseIdx
, word1
->next
);
3197 word1
= word1
->next
;
3199 blk
->addWord(word2
);
3200 if (word2
->base
< minBase
) {
3201 minBase
= word2
->base
;
3202 } else if (word2
->base
> maxBase
) {
3203 maxBase
= word2
->base
;
3209 word1
= word1
->next
;
3217 //~ need to compute the primary writing mode (horiz/vert) in
3218 //~ addition to primary rotation
3220 // coalesce the block, and add it to the list
3221 blk
->coalesce(uMap
, fixedPitch
);
3223 lastBlk
->next
= blk
;
3228 count
[rot
] += blk
->charCount
;
3232 if (count
[rot
] > count
[primaryRot
]) {
3237 #if 0 // for debugging
3238 printf("*** rotation ***\n");
3239 for (rot
= 0; rot
< 4; ++rot
) {
3240 printf(" %d: %6d\n", rot
, count
[rot
]);
3242 printf(" primary rot = %d\n", primaryRot
);
3246 #if 0 // for debugging
3247 printf("*** blocks ***\n");
3248 for (blk
= blkList
; blk
; blk
= blk
->next
) {
3249 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
3250 blk
->rot
, blk
->xMin
, blk
->xMax
, blk
->yMin
, blk
->yMax
);
3251 for (line
= blk
->lines
; line
; line
= line
->next
) {
3252 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
3253 line
->xMin
, line
->xMax
, line
->yMin
, line
->yMax
, line
->base
);
3254 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
3255 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3256 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
3257 word0
->base
, word0
->fontSize
, word0
->spaceAfter
);
3258 for (i
= 0; i
< word0
->len
; ++i
) {
3259 fputc(word0
->text
[i
] & 0xff, stdout
);
3268 // determine the primary direction
3270 for (blk
= blkList
; blk
; blk
= blk
->next
) {
3271 for (line
= blk
->lines
; line
; line
= line
->next
) {
3272 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
3273 for (i
= 0; i
< word0
->len
; ++i
) {
3274 if (unicodeTypeL(word0
->text
[i
])) {
3276 } else if (unicodeTypeR(word0
->text
[i
])) {
3283 primaryLR
= lrCount
>= 0;
3285 #if 0 // for debugging
3286 printf("*** direction ***\n");
3287 printf("lrCount = %d\n", lrCount
);
3288 printf("primaryLR = %d\n", primaryLR
);
3291 //----- column assignment
3293 // sort blocks into xy order for column assignment
3296 if (physLayout
&& fixedPitch
) {
3298 blocks
= (TextBlock
**)gmallocn(nBlocks
, sizeof(TextBlock
*));
3299 for (blk
= blkList
, i
= 0; blk
; blk
= blk
->next
, ++i
) {
3301 col1
= 0; // make gcc happy
3302 switch (primaryRot
) {
3304 col1
= (int)(blk
->xMin
/ fixedPitch
+ 0.5);
3307 col1
= (int)(blk
->yMin
/ fixedPitch
+ 0.5);
3310 col1
= (int)((pageWidth
- blk
->xMax
) / fixedPitch
+ 0.5);
3313 col1
= (int)((pageHeight
- blk
->yMax
) / fixedPitch
+ 0.5);
3317 for (line
= blk
->lines
; line
; line
= line
->next
) {
3318 for (j
= 0; j
<= line
->len
; ++j
) {
3319 line
->col
[j
] += col1
;
3326 // sort blocks into xy order for column assignment
3327 blocks
= (TextBlock
**)gmallocn(nBlocks
, sizeof(TextBlock
*));
3328 for (blk
= blkList
, i
= 0; blk
; blk
= blk
->next
, ++i
) {
3331 qsort(blocks
, nBlocks
, sizeof(TextBlock
*), &TextBlock::cmpXYPrimaryRot
);
3333 // column assignment
3334 for (i
= 0; i
< nBlocks
; ++i
) {
3337 for (j
= 0; j
< i
; ++j
) {
3339 col2
= 0; // make gcc happy
3340 switch (primaryRot
) {
3342 if (blk0
->xMin
> blk1
->xMax
) {
3343 col2
= blk1
->col
+ blk1
->nColumns
+ 3;
3344 } else if (blk1
->xMax
== blk1
->xMin
) {
3347 col2
= blk1
->col
+ (int)(((blk0
->xMin
- blk1
->xMin
) /
3348 (blk1
->xMax
- blk1
->xMin
)) *
3353 if (blk0
->yMin
> blk1
->yMax
) {
3354 col2
= blk1
->col
+ blk1
->nColumns
+ 3;
3355 } else if (blk1
->yMax
== blk1
->yMin
) {
3358 col2
= blk1
->col
+ (int)(((blk0
->yMin
- blk1
->yMin
) /
3359 (blk1
->yMax
- blk1
->yMin
)) *
3364 if (blk0
->xMax
< blk1
->xMin
) {
3365 col2
= blk1
->col
+ blk1
->nColumns
+ 3;
3366 } else if (blk1
->xMin
== blk1
->xMax
) {
3369 col2
= blk1
->col
+ (int)(((blk0
->xMax
- blk1
->xMax
) /
3370 (blk1
->xMin
- blk1
->xMax
)) *
3375 if (blk0
->yMax
< blk1
->yMin
) {
3376 col2
= blk1
->col
+ blk1
->nColumns
+ 3;
3377 } else if (blk1
->yMin
== blk1
->yMax
) {
3380 col2
= blk1
->col
+ (int)(((blk0
->yMax
- blk1
->yMax
) /
3381 (blk1
->yMin
- blk1
->yMax
)) *
3391 for (line
= blk0
->lines
; line
; line
= line
->next
) {
3392 for (j
= 0; j
<= line
->len
; ++j
) {
3393 line
->col
[j
] += col1
;
3400 #if 0 // for debugging
3401 printf("*** blocks, after column assignment ***\n");
3402 for (blk
= blkList
; blk
; blk
= blk
->next
) {
3403 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
3404 blk
->rot
, blk
->xMin
, blk
->xMax
, blk
->yMin
, blk
->yMax
, blk
->col
,
3406 for (line
= blk
->lines
; line
; line
= line
->next
) {
3407 printf(" line: col[0]=%d\n", line
->col
[0]);
3408 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
3409 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3410 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
3411 word0
->base
, word0
->fontSize
, word0
->spaceAfter
);
3412 for (i
= 0; i
< word0
->len
; ++i
) {
3413 fputc(word0
->text
[i
] & 0xff, stdout
);
3422 //----- reading order sort
3424 // compute space on left and right sides of each block
3425 for (i
= 0; i
< nBlocks
; ++i
) {
3427 for (j
= 0; j
< nBlocks
; ++j
) {
3430 blk0
->updatePriMinMax(blk1
);
3435 #if 0 // for debugging
3440 GBool
*visited
= (GBool
*)gmallocn(nBlocks
, sizeof(GBool
));
3441 for (i
= 0; i
< nBlocks
; i
++) {
3442 visited
[i
] = gFalse
;
3445 double bxMin0
, byMin0
, bxMin1
, byMin1
;
3448 int correspondenceX
, correspondenceY
;
3449 double xCentre1
, yCentre1
, xCentre2
, yCentre2
;
3450 double xCentre3
, yCentre3
, xCentre4
, yCentre4
;
3451 double deltaX
, deltaY
;
3452 TextBlock
*fblk2
= NULL
, *fblk3
= NULL
, *fblk4
= NULL
;
3454 for (blk1
= blkList
; blk1
; blk1
= blk1
->next
) {
3455 blk1
->ExMin
= blk1
->xMin
;
3456 blk1
->ExMax
= blk1
->xMax
;
3457 blk1
->EyMin
= blk1
->yMin
;
3458 blk1
->EyMax
= blk1
->yMax
;
3469 /* find fblk2, fblk3 and fblk4 so that
3470 * fblk2 is on the right of blk1 and overlap with blk1 in y axis
3471 * fblk3 is under blk1 and overlap with blk1 in x axis
3472 * fblk4 is under blk1 and on the right of blk1
3473 * and they are closest to blk1
3475 for (blk2
= blkList
; blk2
; blk2
= blk2
->next
) {
3477 if (blk2
->yMin
<= blk1
->yMax
&&
3478 blk2
->yMax
>= blk1
->yMin
&&
3479 blk2
->xMin
> blk1
->xMax
&&
3480 blk2
->xMin
< bxMin0
) {
3481 bxMin0
= blk2
->xMin
;
3483 } else if (blk2
->xMin
<= blk1
->xMax
&&
3484 blk2
->xMax
>= blk1
->xMin
&&
3485 blk2
->yMin
> blk1
->yMax
&&
3486 blk2
->yMin
< byMin0
) {
3487 byMin0
= blk2
->yMin
;
3489 } else if (blk2
->xMin
> blk1
->xMax
&&
3490 blk2
->xMin
< bxMin1
&&
3491 blk2
->yMin
> blk1
->yMax
&&
3492 blk2
->yMin
< byMin1
) {
3493 bxMin1
= blk2
->xMin
;
3494 byMin1
= blk2
->yMin
;
3500 /* fblk4 can not overlap with fblk3 in x and with fblk2 in y
3501 * fblk2 can not overlap with fblk3 in x and y
3502 * fblk4 has to overlap with fblk3 in y and with fblk2 in x
3504 if (fblk2
!= NULL
&&
3507 if (((fblk3
->xMin
<= fblk4
->xMax
&& fblk3
->xMax
>= fblk4
->xMin
) ||
3508 (fblk2
->yMin
<= fblk4
->yMax
&& fblk2
->yMax
>= fblk4
->yMin
) ||
3509 (fblk2
->xMin
<= fblk3
->xMax
&& fblk2
->xMax
>= fblk3
->xMin
) ||
3510 (fblk2
->yMin
<= fblk3
->yMax
&& fblk2
->yMax
>= fblk3
->yMin
)) ||
3511 !(fblk4
->xMin
<= fblk2
->xMax
&& fblk4
->xMax
>= fblk2
->xMin
&&
3512 fblk4
->yMin
<= fblk3
->yMax
&& fblk4
->yMax
>= fblk3
->yMin
)) {
3519 // if we found any then look whether they form a table
3520 if (fblk2
!= NULL
&&
3524 correspondenceX
= 0;
3525 correspondenceY
= 0;
3529 if (blk1
->lines
&& blk1
->lines
->words
)
3530 deltaX
= blk1
->lines
->words
->getFontSize();
3531 if (fblk2
->lines
&& fblk2
->lines
->words
)
3532 deltaX
= deltaX
< fblk2
->lines
->words
->getFontSize() ?
3533 deltaX
: fblk2
->lines
->words
->getFontSize();
3534 if (fblk3
->lines
&& fblk3
->lines
->words
)
3535 deltaX
= deltaX
< fblk3
->lines
->words
->getFontSize() ?
3536 deltaX
: fblk3
->lines
->words
->getFontSize();
3537 if (fblk4
->lines
&& fblk4
->lines
->words
)
3538 deltaX
= deltaX
< fblk4
->lines
->words
->getFontSize() ?
3539 deltaX
: fblk4
->lines
->words
->getFontSize();
3543 deltaX
*= minColSpacing1
;
3544 deltaY
*= maxIntraLineDelta
;
3546 xCentre1
= (blk1
->xMax
+ blk1
->xMin
) / 2.0;
3547 yCentre1
= (blk1
->yMax
+ blk1
->yMin
) / 2.0;
3548 xCentre2
= (fblk2
->xMax
+ fblk2
->xMin
) / 2.0;
3549 yCentre2
= (fblk2
->yMax
+ fblk2
->yMin
) / 2.0;
3550 xCentre3
= (fblk3
->xMax
+ fblk3
->xMin
) / 2.0;
3551 yCentre3
= (fblk3
->yMax
+ fblk3
->yMin
) / 2.0;
3552 xCentre4
= (fblk4
->xMax
+ fblk4
->xMin
) / 2.0;
3553 yCentre4
= (fblk4
->yMax
+ fblk4
->yMin
) / 2.0;
3555 // are blocks centrally aligned in x ?
3556 if (fabs (xCentre1
- xCentre3
) <= deltaX
&&
3557 fabs (xCentre2
- xCentre4
) <= deltaX
)
3560 // are blocks centrally aligned in y ?
3561 if (fabs (yCentre1
- yCentre2
) <= deltaY
&&
3562 fabs (yCentre3
- yCentre4
) <= deltaY
)
3565 // are blocks aligned to the left ?
3566 if (fabs (blk1
->xMin
- fblk3
->xMin
) <= deltaX
&&
3567 fabs (fblk2
->xMin
- fblk4
->xMin
) <= deltaX
)
3570 // are blocks aligned to the right ?
3571 if (fabs (blk1
->xMax
- fblk3
->xMax
) <= deltaX
&&
3572 fabs (fblk2
->xMax
- fblk4
->xMax
) <= deltaX
)
3575 // are blocks aligned to the top ?
3576 if (fabs (blk1
->yMin
- fblk2
->yMin
) <= deltaY
&&
3577 fabs (fblk3
->yMin
- fblk4
->yMin
) <= deltaY
)
3580 // are blocks aligned to the bottom ?
3581 if (fabs (blk1
->yMax
- fblk2
->yMax
) <= deltaY
&&
3582 fabs (fblk3
->yMax
- fblk4
->yMax
) <= deltaY
)
3585 // are blocks aligned in x and y ?
3586 if (correspondenceX
> 0 &&
3587 correspondenceY
> 0) {
3589 // find maximal tableId
3590 tableId
= tableId
< fblk4
->tableId
? fblk4
->tableId
: tableId
;
3591 tableId
= tableId
< fblk3
->tableId
? fblk3
->tableId
: tableId
;
3592 tableId
= tableId
< fblk2
->tableId
? fblk2
->tableId
: tableId
;
3593 tableId
= tableId
< blk1
->tableId
? blk1
->tableId
: tableId
;
3595 // if the tableId is -1, then we found new table
3597 tableId
= numTables
;
3601 blk1
->tableId
= tableId
;
3602 fblk2
->tableId
= tableId
;
3603 fblk3
->tableId
= tableId
;
3604 fblk4
->tableId
= tableId
;
3609 /* set extended bounding boxes of all table entries
3610 * so that they contain whole table
3611 * (we need to process whole table size when comparing it
3612 * with regular text blocks)
3614 PDFRectangle
*envelopes
= new PDFRectangle
[numTables
];
3615 TextBlock
**ending_blocks
= new TextBlock
* [numTables
];
3617 for (i
= 0; i
< numTables
; i
++) {
3618 envelopes
[i
].x1
= DBL_MAX
;
3619 envelopes
[i
].x2
= DBL_MIN
;
3620 envelopes
[i
].y1
= DBL_MAX
;
3621 envelopes
[i
].y2
= DBL_MIN
;
3624 for (blk1
= blkList
; blk1
; blk1
= blk1
->next
) {
3625 if (blk1
->tableId
>= 0) {
3626 if (blk1
->ExMin
< envelopes
[blk1
->tableId
].x1
) {
3627 envelopes
[blk1
->tableId
].x1
= blk1
->ExMin
;
3628 if (!blk1
->page
->primaryLR
)
3629 ending_blocks
[blk1
->tableId
] = blk1
;
3632 if (blk1
->ExMax
> envelopes
[blk1
->tableId
].x2
) {
3633 envelopes
[blk1
->tableId
].x2
= blk1
->ExMax
;
3634 if (blk1
->page
->primaryLR
)
3635 ending_blocks
[blk1
->tableId
] = blk1
;
3638 envelopes
[blk1
->tableId
].y1
= blk1
->EyMin
< envelopes
[blk1
->tableId
].y1
?
3639 blk1
->EyMin
: envelopes
[blk1
->tableId
].y1
;
3640 envelopes
[blk1
->tableId
].y2
= blk1
->EyMax
> envelopes
[blk1
->tableId
].y2
?
3641 blk1
->EyMax
: envelopes
[blk1
->tableId
].y2
;
3645 for (blk1
= blkList
; blk1
; blk1
= blk1
->next
) {
3646 if (blk1
->tableId
>= 0 &&
3647 blk1
->xMin
<= ending_blocks
[blk1
->tableId
]->xMax
&&
3648 blk1
->xMax
>= ending_blocks
[blk1
->tableId
]->xMin
) {
3649 blk1
->tableEnd
= gTrue
;
3653 for (blk1
= blkList
; blk1
; blk1
= blk1
->next
) {
3654 if (blk1
->tableId
>= 0) {
3655 blk1
->ExMin
= envelopes
[blk1
->tableId
].x1
;
3656 blk1
->ExMax
= envelopes
[blk1
->tableId
].x2
;
3657 blk1
->EyMin
= envelopes
[blk1
->tableId
].y1
;
3658 blk1
->EyMax
= envelopes
[blk1
->tableId
].y2
;
3662 delete[] ending_blocks
;
3665 /* set extended bounding boxes of all other blocks
3666 * so that they extend in x without hitting neighbours
3668 for (blk1
= blkList
; blk1
; blk1
= blk1
->next
) {
3669 if (!(blk1
->tableId
>= 0)) {
3670 double xMax
= DBL_MAX
;
3671 double xMin
= DBL_MIN
;
3673 for (blk2
= blkList
; blk2
; blk2
= blk2
->next
) {
3677 if (blk1
->yMin
<= blk2
->yMax
&& blk1
->yMax
>= blk2
->yMin
) {
3678 if (blk2
->xMin
< xMax
&& blk2
->xMin
> blk1
->xMax
)
3681 if (blk2
->xMax
> xMin
&& blk2
->xMax
< blk1
->xMin
)
3686 for (blk2
= blkList
; blk2
; blk2
= blk2
->next
) {
3690 if (blk2
->xMax
> blk1
->ExMax
&&
3691 blk2
->xMax
<= xMax
&&
3692 blk2
->yMin
>= blk1
->yMax
) {
3693 blk1
->ExMax
= blk2
->xMax
;
3696 if (blk2
->xMin
< blk1
->ExMin
&&
3697 blk2
->xMin
>= xMin
&&
3698 blk2
->yMin
>= blk1
->yMax
)
3699 blk1
->ExMin
= blk2
->xMin
;
3705 for (blk1
= blkList
; blk1
; blk1
= blk1
->next
) {
3707 sortPos
= blk1
->visitDepthFirst(blkList
, i
, blocks
, sortPos
, visited
);
3713 #if 0 // for debugging
3714 printf("*** blocks, after ro sort ***\n");
3715 for (i
= 0; i
< nBlocks
; ++i
) {
3717 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3718 blk
->rot
, blk
->xMin
, blk
->xMax
, blk
->yMin
, blk
->yMax
,
3719 blk
->priMin
, blk
->priMax
);
3720 for (line
= blk
->lines
; line
; line
= line
->next
) {
3722 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
3723 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3724 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
3725 word0
->base
, word0
->fontSize
, word0
->spaceAfter
);
3726 for (j
= 0; j
< word0
->len
; ++j
) {
3727 fputc(word0
->text
[j
] & 0xff, stdout
);
3738 //~ this needs to be adjusted for writing mode (vertical text)
3739 //~ this also needs to account for right-to-left column ordering
3743 flows
= flows
->next
;
3746 flows
= lastFlow
= NULL
;
3747 // assume blocks are already in reading order,
3748 // and construct flows accordingly.
3749 for (i
= 0; i
< nBlocks
; i
++) {
3753 blk1
= blocks
[i
- 1];
3754 blkSpace
= maxBlockSpacing
* blk1
->lines
->words
->fontSize
;
3755 if (blk1
->secondaryDelta(blk
) <= blkSpace
&&
3756 blk
->isBelow(blk1
) &&
3757 flow
->blockFits(blk
, blk1
)) {
3758 flow
->addBlock(blk
);
3762 flow
= new TextFlow(this, blk
);
3764 lastFlow
->next
= flow
;
3771 #if 0 // for debugging
3772 printf("*** flows ***\n");
3773 for (flow
= flows
; flow
; flow
= flow
->next
) {
3774 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3775 flow
->xMin
, flow
->xMax
, flow
->yMin
, flow
->yMax
,
3776 flow
->priMin
, flow
->priMax
);
3777 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
3778 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3779 blk
->rot
, blk
->ExMin
, blk
->ExMax
, blk
->EyMin
, blk
->EyMax
,
3780 blk
->priMin
, blk
->priMax
);
3781 for (line
= blk
->lines
; line
; line
= line
->next
) {
3783 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
3784 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3785 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
3786 word0
->base
, word0
->fontSize
, word0
->spaceAfter
);
3787 for (i
= 0; i
< word0
->len
; ++i
) {
3788 fputc(word0
->text
[i
] & 0xff, stdout
);
3803 GBool
TextPage::findText(Unicode
*s
, int len
,
3804 GBool startAtTop
, GBool stopAtBottom
,
3805 GBool startAtLast
, GBool stopAtLast
,
3806 GBool caseSensitive
, GBool backward
,
3808 double *xMin
, double *yMin
,
3809 double *xMax
, double *yMax
) {
3812 Unicode
*s2
, *txt
, *reordered
;
3814 int txtSize
, m
, i
, j
, k
;
3815 double xStart
, yStart
, xStop
, yStop
;
3816 double xMin0
, yMin0
, xMax0
, yMax0
;
3817 double xMin1
, yMin1
, xMax1
, yMax1
;
3825 // handle right-to-left text
3826 reordered
= (Unicode
*)gmallocn(len
, sizeof(Unicode
));
3827 reorderText(s
, len
, NULL
, primaryLR
, NULL
, reordered
);
3829 // normalize the search string
3830 s2
= unicodeNormalizeNFKC(reordered
, len
, &len
, NULL
);
3832 // convert the search string to uppercase
3833 if (!caseSensitive
) {
3834 for (i
= 0; i
< len
; ++i
) {
3835 s2
[i
] = unicodeToUpper(s2
[i
]);
3842 xStart
= yStart
= xStop
= yStop
= 0;
3843 if (startAtLast
&& haveLastFind
) {
3844 xStart
= lastFindXMin
;
3845 yStart
= lastFindYMin
;
3846 } else if (!startAtTop
) {
3850 if (stopAtLast
&& haveLastFind
) {
3851 xStop
= lastFindXMin
;
3852 yStop
= lastFindYMin
;
3853 } else if (!stopAtBottom
) {
3859 xMin0
= xMax0
= yMin0
= yMax0
= 0; // make gcc happy
3860 xMin1
= xMax1
= yMin1
= yMax1
= 0; // make gcc happy
3862 for (i
= backward
? nBlocks
- 1 : 0;
3863 backward
? i
>= 0 : i
< nBlocks
;
3864 i
+= backward
? -1 : 1) {
3867 // check: is the block above the top limit?
3868 // (this only works if the page's primary rotation is zero --
3869 // otherwise the blocks won't be sorted in the useful order)
3870 if (!startAtTop
&& primaryRot
== 0 &&
3871 (backward
? blk
->yMin
> yStart
: blk
->yMax
< yStart
)) {
3875 // check: is the block below the bottom limit?
3876 // (this only works if the page's primary rotation is zero --
3877 // otherwise the blocks won't be sorted in the useful order)
3878 if (!stopAtBottom
&& primaryRot
== 0 &&
3879 (backward
? blk
->yMax
< yStop
: blk
->yMin
> yStop
)) {
3883 for (line
= blk
->lines
; line
; line
= line
->next
) {
3885 // check: is the line above the top limit?
3886 // (this only works if the page's primary rotation is zero --
3887 // otherwise the lines won't be sorted in the useful order)
3888 if (!startAtTop
&& primaryRot
== 0 &&
3889 (backward
? line
->yMin
> yStart
: line
->yMin
< yStart
)) {
3893 // check: is the line below the bottom limit?
3894 // (this only works if the page's primary rotation is zero --
3895 // otherwise the lines won't be sorted in the useful order)
3896 if (!stopAtBottom
&& primaryRot
== 0 &&
3897 (backward
? line
->yMin
< yStop
: line
->yMin
> yStop
)) {
3901 if (!line
->normalized
)
3902 line
->normalized
= unicodeNormalizeNFKC(line
->text
, line
->len
,
3903 &line
->normalized_len
,
3904 &line
->normalized_idx
,
3906 // convert the line to uppercase
3907 m
= line
->normalized_len
;
3908 if (!caseSensitive
) {
3910 txt
= (Unicode
*)greallocn(txt
, m
, sizeof(Unicode
));
3913 for (k
= 0; k
< m
; ++k
) {
3914 txt
[k
] = unicodeToUpper(line
->normalized
[k
]);
3917 txt
= line
->normalized
;
3920 // search each position in this line
3921 j
= backward
? m
- len
: 0;
3923 while (backward
? j
>= 0 : j
<= m
- len
) {
3925 ((j
== 0 || !unicodeTypeAlphaNum(txt
[j
- 1])) &&
3926 (j
+ len
== m
|| !unicodeTypeAlphaNum(txt
[j
+ len
])))) {
3928 // compare the strings
3929 for (k
= 0; k
< len
; ++k
) {
3930 if (p
[k
] != s2
[k
]) {
3937 // where s2 matches a subsequence of a compatibility equivalence
3938 // decomposition, highlight the entire glyph, since we don't know
3939 // the internal layout of subglyph components
3940 int normStart
= line
->normalized_idx
[j
];
3941 int normAfterEnd
= line
->normalized_idx
[j
+ len
- 1] + 1;
3942 switch (line
->rot
) {
3944 xMin1
= line
->edge
[normStart
];
3945 xMax1
= line
->edge
[normAfterEnd
];
3952 yMin1
= line
->edge
[normStart
];
3953 yMax1
= line
->edge
[normAfterEnd
];
3956 xMin1
= line
->edge
[normAfterEnd
];
3957 xMax1
= line
->edge
[normStart
];
3964 yMin1
= line
->edge
[normAfterEnd
];
3965 yMax1
= line
->edge
[normStart
];
3970 yMin1
< yStart
|| (yMin1
== yStart
&& xMin1
< xStart
)) &&
3972 yMin1
> yStop
|| (yMin1
== yStop
&& xMin1
> xStop
))) {
3974 yMin1
> yMin0
|| (yMin1
== yMin0
&& xMin1
> xMin0
)) {
3984 yMin1
> yStart
|| (yMin1
== yStart
&& xMin1
> xStart
)) &&
3986 yMin1
< yStop
|| (yMin1
== yStop
&& xMin1
< xStop
))) {
3988 yMin1
< yMin0
|| (yMin1
== yMin0
&& xMin1
< xMin0
)) {
4012 if (!caseSensitive
) {
4021 lastFindXMin
= xMin0
;
4022 lastFindYMin
= yMin0
;
4023 haveLastFind
= gTrue
;
4030 GooString
*TextPage::getText(double xMin
, double yMin
,
4031 double xMax
, double yMax
) {
4036 TextLineFrag
*frags
;
4037 int nFrags
, fragsSize
;
4039 char space
[8], eol
[16];
4040 int spaceLen
, eolLen
;
4043 int col
, idx0
, idx1
, i
, j
;
4044 GBool multiLine
, oneRot
;
4046 s
= new GooString();
4048 // get the output encoding
4049 if (!(uMap
= globalParams
->getTextEncoding())) {
4058 for (word
= rawWords
; word
&& word
<= rawLastWord
; word
= word
->next
) {
4059 for (j
= 0; j
< word
->getLength(); ++j
) {
4060 double gXMin
, gXMax
, gYMin
, gYMax
;
4061 word
->getCharBBox(j
, &gXMin
, &gYMin
, &gXMax
, &gYMax
);
4062 if (xMin
<= gXMin
&& gXMax
<= xMax
&& yMin
<= gYMin
&& gYMax
<= yMax
)
4064 mbc_len
= uMap
->mapUnicode( *(word
->getChar(j
)), mbc
, sizeof(mbc
) );
4065 s
->append(mbc
, mbc_len
);
4072 spaceLen
= uMap
->mapUnicode(0x20, space
, sizeof(space
));
4073 eolLen
= 0; // make gcc happy
4074 switch (globalParams
->getTextEOL()) {
4076 eolLen
= uMap
->mapUnicode(0x0a, eol
, sizeof(eol
));
4079 eolLen
= uMap
->mapUnicode(0x0d, eol
, sizeof(eol
));
4080 eolLen
+= uMap
->mapUnicode(0x0a, eol
+ eolLen
, sizeof(eol
) - eolLen
);
4083 eolLen
= uMap
->mapUnicode(0x0d, eol
, sizeof(eol
));
4087 //~ writing mode (horiz/vert)
4089 // collect the line fragments that are in the rectangle
4091 frags
= (TextLineFrag
*)gmallocn(fragsSize
, sizeof(TextLineFrag
));
4095 for (i
= 0; i
< nBlocks
; ++i
) {
4097 if (xMin
< blk
->xMax
&& blk
->xMin
< xMax
&&
4098 yMin
< blk
->yMax
&& blk
->yMin
< yMax
) {
4099 for (line
= blk
->lines
; line
; line
= line
->next
) {
4100 if (xMin
< line
->xMax
&& line
->xMin
< xMax
&&
4101 yMin
< line
->yMax
&& line
->yMin
< yMax
) {
4103 switch (line
->rot
) {
4105 y
= 0.5 * (line
->yMin
+ line
->yMax
);
4106 if (yMin
< y
&& y
< yMax
) {
4108 while (j
< line
->len
) {
4109 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) > xMin
) {
4117 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) < xMax
) {
4126 x
= 0.5 * (line
->xMin
+ line
->xMax
);
4127 if (xMin
< x
&& x
< xMax
) {
4129 while (j
< line
->len
) {
4130 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) > yMin
) {
4138 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) < yMax
) {
4147 y
= 0.5 * (line
->yMin
+ line
->yMax
);
4148 if (yMin
< y
&& y
< yMax
) {
4150 while (j
< line
->len
) {
4151 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) < xMax
) {
4159 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) > xMin
) {
4168 x
= 0.5 * (line
->xMin
+ line
->xMax
);
4169 if (xMin
< x
&& x
< xMax
) {
4171 while (j
< line
->len
) {
4172 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) < yMax
) {
4180 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) > yMin
) {
4189 if (idx0
>= 0 && idx1
>= 0) {
4190 if (nFrags
== fragsSize
) {
4192 frags
= (TextLineFrag
*)
4193 greallocn(frags
, fragsSize
, sizeof(TextLineFrag
));
4195 frags
[nFrags
].init(line
, idx0
, idx1
- idx0
+ 1);
4197 if (lastRot
>= 0 && line
->rot
!= lastRot
) {
4200 lastRot
= line
->rot
;
4207 // sort the fragments and generate the string
4210 for (i
= 0; i
< nFrags
; ++i
) {
4211 frags
[i
].computeCoords(oneRot
);
4213 assignColumns(frags
, nFrags
, oneRot
);
4215 // if all lines in the region have the same rotation, use it;
4216 // otherwise, use the page's primary rotation
4218 qsort(frags
, nFrags
, sizeof(TextLineFrag
),
4219 &TextLineFrag::cmpYXLineRot
);
4221 qsort(frags
, nFrags
, sizeof(TextLineFrag
),
4222 &TextLineFrag::cmpYXPrimaryRot
);
4225 while (i
< nFrags
) {
4226 delta
= maxIntraLineDelta
* frags
[i
].line
->words
->fontSize
;
4228 j
< nFrags
&& fabs(frags
[j
].base
- frags
[i
].base
) < delta
;
4230 qsort(frags
+ i
, j
- i
, sizeof(TextLineFrag
),
4231 oneRot
? &TextLineFrag::cmpXYColumnLineRot
4232 : &TextLineFrag::cmpXYColumnPrimaryRot
);
4238 for (i
= 0; i
< nFrags
; ++i
) {
4242 if (frag
->col
< col
||
4243 (i
> 0 && fabs(frag
->base
- frags
[i
-1].base
) >
4244 maxIntraLineDelta
* frags
[i
-1].line
->words
->fontSize
)) {
4245 s
->append(eol
, eolLen
);
4251 for (; col
< frag
->col
; ++col
) {
4252 s
->append(space
, spaceLen
);
4255 // get the fragment text
4256 col
+= dumpFragment(frag
->line
->text
+ frag
->start
, frag
->len
, uMap
, s
);
4260 s
->append(eol
, eolLen
);
4270 class TextSelectionVisitor
{
4272 TextSelectionVisitor (TextPage
*page
);
4273 virtual ~TextSelectionVisitor () { }
4274 virtual void visitBlock (TextBlock
*block
,
4277 PDFRectangle
*selection
) = 0;
4278 virtual void visitLine (TextLine
*line
,
4283 PDFRectangle
*selection
) = 0;
4284 virtual void visitWord (TextWord
*word
, int begin
, int end
,
4285 PDFRectangle
*selection
) = 0;
4291 TextSelectionVisitor::TextSelectionVisitor (TextPage
*page
)
4297 class TextSelectionDumper
: public TextSelectionVisitor
{
4299 TextSelectionDumper(TextPage
*page
);
4300 virtual ~TextSelectionDumper();
4302 virtual void visitBlock (TextBlock
*block
,
4305 PDFRectangle
*selection
) { };
4306 virtual void visitLine (TextLine
*line
,
4311 PDFRectangle
*selection
);
4312 virtual void visitWord (TextWord
*word
, int begin
, int end
,
4313 PDFRectangle
*selection
);
4316 GooString
*getText(void);
4317 GooList
**takeWordList(int *nLines
);
4325 int nLines
, linesSize
;
4328 TextBlock
*currentBlock
;
4331 TextSelectionDumper::TextSelectionDumper(TextPage
*page
)
4332 : TextSelectionVisitor(page
)
4335 lines
= (GooList
**)gmallocn(linesSize
, sizeof(GooList
*));
4339 currentBlock
= NULL
;
4343 TextSelectionDumper::~TextSelectionDumper()
4345 for (int i
= 0; i
< nLines
; i
++)
4346 deleteGooList(lines
[i
], TextWordSelection
);
4350 void TextSelectionDumper::startLine()
4353 words
= new GooList();
4356 void TextSelectionDumper::finishLine()
4358 if (nLines
== linesSize
) {
4360 lines
= (GooList
**)grealloc(lines
, linesSize
* sizeof(GooList
*));
4363 if (words
&& words
->getLength() > 0)
4364 lines
[nLines
++] = words
;
4370 void TextSelectionDumper::visitLine (TextLine
*line
,
4375 PDFRectangle
*selection
)
4379 frag
.init(line
, edge_begin
, edge_end
- edge_begin
);
4381 if (tableId
>= 0 && frag
.line
->blk
->tableId
< 0) {
4385 currentBlock
= NULL
;
4388 if (frag
.line
->blk
->tableId
>= 0) { // a table
4389 if (tableId
== -1) {
4390 tableId
= frag
.line
->blk
->tableId
;
4391 currentBlock
= frag
.line
->blk
;
4394 if (currentBlock
== frag
.line
->blk
) { // the same block
4396 } else { // another block
4397 if (currentBlock
->tableEnd
) { // previous block ended its row
4400 currentBlock
= frag
.line
->blk
;
4402 } else { // not a table
4407 void TextSelectionDumper::visitWord (TextWord
*word
, int begin
, int end
,
4408 PDFRectangle
*selection
)
4410 words
->append(new TextWordSelection(word
, begin
, end
));
4413 void TextSelectionDumper::endPage()
4418 GooString
*TextSelectionDumper::getText (void)
4423 char space
[8], eol
[16];
4424 int spaceLen
, eolLen
;
4426 text
= new GooString();
4428 if (!(uMap
= globalParams
->getTextEncoding()))
4431 spaceLen
= uMap
->mapUnicode(0x20, space
, sizeof(space
));
4432 eolLen
= uMap
->mapUnicode(0x0a, eol
, sizeof(eol
));
4434 for (i
= 0; i
< nLines
; i
++) {
4435 GooList
*lineWords
= lines
[i
];
4436 for (j
= 0; j
< lineWords
->getLength(); j
++) {
4437 TextWordSelection
*sel
= (TextWordSelection
*)lineWords
->get(j
);
4439 page
->dumpFragment (sel
->word
->text
+ sel
->begin
, sel
->end
- sel
->begin
, uMap
, text
);
4440 if (j
< lineWords
->getLength() - 1)
4441 text
->append(space
, spaceLen
);
4444 text
->append(eol
, eolLen
);
4452 GooList
**TextSelectionDumper::takeWordList(int *nLinesOut
)
4454 GooList
**returnValue
= lines
;
4456 *nLinesOut
= nLines
;
4466 class TextSelectionSizer
: public TextSelectionVisitor
{
4468 TextSelectionSizer(TextPage
*page
, double scale
);
4469 ~TextSelectionSizer() { }
4471 virtual void visitBlock (TextBlock
*block
,
4474 PDFRectangle
*selection
) { };
4475 virtual void visitLine (TextLine
*line
,
4480 PDFRectangle
*selection
);
4481 virtual void visitWord (TextWord
*word
, int begin
, int end
,
4482 PDFRectangle
*selection
) { };
4484 GooList
*getRegion () { return list
; }
4491 TextSelectionSizer::TextSelectionSizer(TextPage
*page
, double scale
)
4492 : TextSelectionVisitor(page
),
4495 list
= new GooList();
4498 void TextSelectionSizer::visitLine (TextLine
*line
,
4503 PDFRectangle
*selection
)
4506 double x1
, y1
, x2
, y2
, margin
;
4508 margin
= (line
->yMax
- line
->yMin
) / 8;
4509 x1
= line
->edge
[edge_begin
];
4510 y1
= line
->yMin
- margin
;
4511 x2
= line
->edge
[edge_end
];
4512 y2
= line
->yMax
+ margin
;
4514 rect
= new PDFRectangle (floor (x1
* scale
),
4518 list
->append (rect
);
4522 class TextSelectionPainter
: public TextSelectionVisitor
{
4524 TextSelectionPainter(TextPage
*page
,
4528 GfxColor
*box_color
,
4529 GfxColor
*glyph_color
);
4530 ~TextSelectionPainter();
4532 virtual void visitBlock (TextBlock
*block
,
4535 PDFRectangle
*selection
) { };
4536 virtual void visitLine (TextLine
*line
,
4541 PDFRectangle
*selection
);
4542 virtual void visitWord (TextWord
*word
, int begin
, int end
,
4543 PDFRectangle
*selection
);
4548 GfxColor
*box_color
, *glyph_color
;
4550 GooList
*selectionList
;
4554 TextSelectionPainter::TextSelectionPainter(TextPage
*page
,
4558 GfxColor
*box_color
,
4559 GfxColor
*glyph_color
)
4560 : TextSelectionVisitor(page
),
4562 box_color(box_color
),
4563 glyph_color(glyph_color
)
4565 PDFRectangle
box(0, 0, page
->pageWidth
, page
->pageHeight
);
4567 selectionList
= new GooList();
4568 state
= new GfxState(72 * scale
, 72 * scale
, &box
, rotation
, gFalse
);
4570 state
->getCTM(&ctm
);
4571 ctm
.invertTo(&ictm
);
4573 out
->startPage(0, state
, NULL
);
4574 out
->setDefaultCTM (state
->getCTM());
4576 state
->setFillColorSpace(new GfxDeviceRGBColorSpace());
4577 state
->setFillColor(box_color
);
4578 out
->updateFillColor(state
);
4581 TextSelectionPainter::~TextSelectionPainter()
4583 deleteGooList(selectionList
, TextWordSelection
);
4587 void TextSelectionPainter::visitLine (TextLine
*line
,
4592 PDFRectangle
*selection
)
4594 double x1
, y1
, x2
, y2
, margin
;
4596 margin
= (line
->yMax
- line
->yMin
) / 8;
4597 x1
= floor (line
->edge
[edge_begin
]);
4598 y1
= floor (line
->yMin
- margin
);
4599 x2
= ceil (line
->edge
[edge_end
]);
4600 y2
= ceil (line
->yMax
+ margin
);
4602 ctm
.transform(line
->edge
[edge_begin
], line
->yMin
- margin
, &x1
, &y1
);
4603 ctm
.transform(line
->edge
[edge_end
], line
->yMax
+ margin
, &x2
, &y2
);
4610 ictm
.transform(x1
, y1
, &x1
, &y1
);
4611 ictm
.transform(x2
, y2
, &x2
, &y2
);
4613 state
->moveTo(x1
, y1
);
4614 state
->lineTo(x2
, y1
);
4615 state
->lineTo(x2
, y2
);
4616 state
->lineTo(x1
, y2
);
4620 void TextSelectionPainter::visitWord (TextWord
*word
, int begin
, int end
,
4621 PDFRectangle
*selection
)
4623 selectionList
->append(new TextWordSelection(word
, begin
, end
));
4626 void TextSelectionPainter::endPage()
4630 out
->saveState(state
);
4635 state
->setFillColor(glyph_color
);
4636 out
->updateFillColor(state
);
4638 for (int i
= 0; i
< selectionList
->getLength(); i
++) {
4639 TextWordSelection
*sel
= (TextWordSelection
*) selectionList
->get(i
);
4640 int begin
= sel
->begin
;
4642 while (begin
< sel
->end
) {
4643 TextFontInfo
*font
= sel
->word
->font
[begin
];
4644 font
->gfxFont
->incRefCnt();
4645 Matrix
*mat
= &sel
->word
->textMat
[begin
];
4647 state
->setTextMat(mat
->m
[0], mat
->m
[1], mat
->m
[2], mat
->m
[3], 0, 0);
4648 state
->setFont(font
->gfxFont
, 1);
4649 out
->updateFont(state
);
4651 int fEnd
= begin
+ 1;
4652 while (fEnd
< sel
->end
&& font
->matches(sel
->word
->font
[fEnd
]) &&
4653 mat
->m
[0] == sel
->word
->textMat
[fEnd
].m
[0] &&
4654 mat
->m
[1] == sel
->word
->textMat
[fEnd
].m
[1] &&
4655 mat
->m
[2] == sel
->word
->textMat
[fEnd
].m
[2] &&
4656 mat
->m
[3] == sel
->word
->textMat
[fEnd
].m
[3])
4659 /* The only purpose of this string is to let the output device query
4660 * it's length. Might want to change this interface later. */
4661 GooString
*string
= new GooString ((char *) sel
->word
->charcode
, fEnd
- begin
);
4662 out
->beginString(state
, string
);
4664 for (int i
= begin
; i
< fEnd
; i
++) {
4665 if (i
!= begin
&& sel
->word
->charPos
[i
] == sel
->word
->charPos
[i
- 1])
4668 out
->drawChar(state
, sel
->word
->textMat
[i
].m
[4], sel
->word
->textMat
[i
].m
[5], 0, 0, 0, 0,
4669 sel
->word
->charcode
[i
], 1, NULL
, 0);
4671 out
->endString(state
);
4677 out
->restoreState(state
);
4681 void TextWord::visitSelection(TextSelectionVisitor
*visitor
,
4682 PDFRectangle
*selection
,
4683 SelectionStyle style
)
4690 for (i
= 0; i
< len
; i
++) {
4691 mid
= (edge
[i
] + edge
[i
+ 1]) / 2;
4692 if (selection
->x1
< mid
|| selection
->x2
< mid
)
4695 if (mid
< selection
->x1
|| mid
< selection
->x2
)
4699 /* Skip empty selection. */
4703 visitor
->visitWord (this, begin
, end
, selection
);
4706 void TextLine::visitSelection(TextSelectionVisitor
*visitor
,
4707 PDFRectangle
*selection
,
4708 SelectionStyle style
) {
4709 TextWord
*p
, *begin
, *end
, *current
;
4710 int i
, edge_begin
, edge_end
;
4711 PDFRectangle child_selection
;
4716 for (p
= words
; p
!= NULL
; p
= p
->next
) {
4717 if (blk
->page
->primaryLR
) {
4718 if ((selection
->x1
< p
->xMax
) ||
4719 (selection
->x2
< p
->xMax
))
4723 if (((selection
->x1
> p
->xMin
) ||
4724 (selection
->x2
> p
->xMin
)) && (begin
!= NULL
)) {
4729 if ((selection
->x1
> p
->xMin
) ||
4730 (selection
->x2
> p
->xMin
))
4734 if (((selection
->x1
< p
->xMax
) ||
4735 (selection
->x2
< p
->xMax
)) && (begin
!= NULL
)) {
4745 child_selection
= *selection
;
4746 if (style
== selectionStyleWord
) {
4747 child_selection
.x1
= begin
? begin
->xMin
: xMin
;
4748 if (end
&& end
->xMax
!= -1) {
4749 child_selection
.x2
= current
->xMax
;
4751 child_selection
.x2
= xMax
;
4757 for (i
= 0; i
< len
; i
++) {
4758 double mid
= (edge
[i
] + edge
[i
+ 1]) / 2;
4759 if (child_selection
.x1
< mid
|| child_selection
.x2
< mid
)
4762 if (mid
< child_selection
.x2
|| mid
< child_selection
.x1
)
4766 /* Skip empty selection. */
4767 if (edge_end
<= edge_begin
)
4770 visitor
->visitLine (this, begin
, end
, edge_begin
, edge_end
,
4773 for (p
= begin
; p
!= end
; p
= p
->next
)
4774 p
->visitSelection (visitor
, &child_selection
, style
);
4777 void TextBlock::visitSelection(TextSelectionVisitor
*visitor
,
4778 PDFRectangle
*selection
,
4779 SelectionStyle style
) {
4780 PDFRectangle child_selection
;
4781 double x
[2], y
[2], d
, best_d
[2];
4782 TextLine
*p
, *best_line
[2];
4783 int i
, count
= 0, best_count
[2], start
, stop
;
4786 x
[0] = selection
->x1
;
4787 y
[0] = selection
->y1
;
4788 x
[1] = selection
->x2
;
4789 y
[1] = selection
->y2
;
4791 for (i
= 0; i
< 2; i
++) {
4792 // the first/last lines are often not nearest
4793 // the corners, so we have to force them to be
4794 // selected when the selection runs outside this
4796 if (page
->primaryLR
) {
4797 all
[i
] = x
[i
] >= this->xMax
&& y
[i
] >= this->yMax
;
4798 if (x
[i
] <= this->xMin
&& y
[i
] <= this->yMin
) {
4799 best_line
[i
] = this->lines
;
4802 best_line
[i
] = NULL
;
4806 all
[i
] = x
[i
] <= this->xMin
&& y
[i
] >= this->yMax
;
4807 if (x
[i
] >= this->xMax
&& y
[i
] <= this->yMin
) {
4808 best_line
[i
] = this->lines
;
4811 best_line
[i
] = NULL
;
4818 // find the nearest line to the selection points
4819 // using the manhattan distance.
4820 for (p
= this->lines
; p
; p
= p
->next
) {
4822 for (i
= 0; i
< 2; i
++) {
4823 d
= fmax(p
->xMin
- x
[i
], 0.0) +
4824 fmax(x
[i
] - p
->xMax
, 0.0) +
4825 fmax(p
->yMin
- y
[i
], 0.0) +
4826 fmax(y
[i
] - p
->yMax
, 0.0);
4827 if (!best_line
[i
] || all
[i
] ||
4830 best_count
[i
] = count
;
4835 // assert: best is always set.
4836 if (!best_line
[0] || !best_line
[1]) {
4840 // Now decide which point was first.
4841 if (best_count
[0] < best_count
[1] ||
4842 (best_count
[0] == best_count
[1] &&
4851 visitor
->visitBlock(this, best_line
[start
], best_line
[stop
], selection
);
4853 for (p
= best_line
[start
]; p
; p
= p
->next
) {
4854 if (page
->primaryLR
) {
4855 child_selection
.x1
= p
->xMin
;
4856 child_selection
.x2
= p
->xMax
;
4858 child_selection
.x1
= p
->xMax
;
4859 child_selection
.x2
= p
->xMin
;
4861 child_selection
.y1
= p
->yMin
;
4862 child_selection
.y2
= p
->yMax
;
4863 if (style
== selectionStyleLine
) {
4864 if (p
== best_line
[start
]) {
4865 child_selection
.x1
= 0;
4866 child_selection
.y1
= 0;
4868 if (p
== best_line
[stop
]) {
4869 child_selection
.x2
= page
->pageWidth
;
4870 child_selection
.y2
= page
->pageHeight
;
4873 if (p
== best_line
[start
]) {
4874 child_selection
.x1
= fmax(p
->xMin
, fmin(p
->xMax
, x
[start
]));
4875 child_selection
.y1
= fmax(p
->yMin
, fmin(p
->yMax
, y
[start
]));
4877 if (p
== best_line
[stop
]) {
4878 child_selection
.x2
= fmax(p
->xMin
, fmin(p
->xMax
, x
[stop
]));
4879 child_selection
.y2
= fmax(p
->yMin
, fmin(p
->yMax
, y
[stop
]));
4882 p
->visitSelection(visitor
, &child_selection
, style
);
4883 if (p
== best_line
[stop
]) {
4889 void TextPage::visitSelection(TextSelectionVisitor
*visitor
,
4890 PDFRectangle
*selection
,
4891 SelectionStyle style
)
4893 PDFRectangle child_selection
;
4894 double x
[2], y
[2], d
, best_d
[2];
4895 double xMin
, yMin
, xMax
, yMax
;
4896 TextFlow
*flow
, *best_flow
[2];
4897 TextBlock
*blk
, *best_block
[2];
4898 int i
, count
= 0, best_count
[2], start
, stop
;
4903 x
[0] = selection
->x1
;
4904 y
[0] = selection
->y1
;
4905 x
[1] = selection
->x2
;
4906 y
[1] = selection
->y2
;
4913 for (i
= 0; i
< 2; i
++) {
4914 best_block
[i
] = NULL
;
4915 best_flow
[i
] = NULL
;
4920 // find the nearest blocks to the selection points
4921 // using the manhattan distance.
4922 for (flow
= flows
; flow
; flow
= flow
->next
) {
4923 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
4925 // the first/last blocks in reading order are
4926 // often not the closest to the page corners;
4927 // track the corners, force those blocks to
4928 // be selected if the selection runs across
4930 xMin
= fmin(xMin
, blk
->xMin
);
4931 yMin
= fmin(yMin
, blk
->yMin
);
4932 xMax
= fmax(xMax
, blk
->xMax
);
4933 yMax
= fmax(yMax
, blk
->yMax
);
4934 for (i
= 0; i
< 2; i
++) {
4935 d
= fmax(blk
->xMin
- x
[i
], 0.0) +
4936 fmax(x
[i
] - blk
->xMax
, 0.0) +
4937 fmax(blk
->yMin
- y
[i
], 0.0) +
4938 fmax(y
[i
] - blk
->yMax
, 0.0);
4939 if (!best_block
[i
] ||
4941 (!blk
->next
&& !flow
->next
&&
4942 x
[i
] >= fmin(xMax
, pageWidth
) &&
4943 y
[i
] >= fmin(yMax
, pageHeight
))) {
4944 best_block
[i
] = blk
;
4945 best_flow
[i
] = flow
;
4946 best_count
[i
] = count
;
4952 for (i
= 0; i
< 2; i
++) {
4954 if (x
[i
] < xMin
&& y
[i
] < yMin
) {
4955 best_block
[i
] = flows
->blocks
;
4956 best_flow
[i
] = flows
;
4960 if (x
[i
] > xMax
&& y
[i
] < yMin
) {
4961 best_block
[i
] = flows
->blocks
;
4962 best_flow
[i
] = flows
;
4967 // assert: best is always set.
4968 if (!best_block
[0] || !best_block
[1]) {
4972 // Now decide which point was first.
4973 if (best_count
[0] < best_count
[1] ||
4974 (best_count
[0] == best_count
[1] && y
[0] < y
[1])) {
4982 for (flow
= best_flow
[start
]; flow
; flow
= flow
->next
) {
4983 if (flow
== best_flow
[start
]) {
4984 blk
= best_block
[start
];
4988 for (; blk
; blk
= blk
->next
) {
4990 child_selection
.x1
= blk
->xMin
;
4991 child_selection
.x2
= blk
->xMax
;
4993 child_selection
.x1
= blk
->xMax
;
4994 child_selection
.x2
= blk
->xMin
;
4996 child_selection
.y1
= blk
->yMin
;
4997 child_selection
.y2
= blk
->yMax
;
4998 if (blk
== best_block
[start
]) {
4999 child_selection
.x1
= fmax(blk
->xMin
, fmin(blk
->xMax
, x
[start
]));
5000 child_selection
.y1
= fmax(blk
->yMin
, fmin(blk
->yMax
, y
[start
]));
5002 if (blk
== best_block
[stop
]) {
5003 child_selection
.x2
= fmax(blk
->xMin
, fmin(blk
->xMax
, x
[stop
]));
5004 child_selection
.y2
= fmax(blk
->yMin
, fmin(blk
->yMax
, y
[stop
]));
5005 blk
->visitSelection(visitor
, &child_selection
, style
);
5008 blk
->visitSelection(visitor
, &child_selection
, style
);
5013 void TextPage::drawSelection(OutputDev
*out
,
5016 PDFRectangle
*selection
,
5017 SelectionStyle style
,
5018 GfxColor
*glyph_color
, GfxColor
*box_color
)
5020 TextSelectionPainter
painter(this, scale
, rotation
,
5021 out
, box_color
, glyph_color
);
5023 visitSelection(&painter
, selection
, style
);
5027 GooList
*TextPage::getSelectionRegion(PDFRectangle
*selection
,
5028 SelectionStyle style
,
5030 TextSelectionSizer
sizer(this, scale
);
5032 visitSelection(&sizer
, selection
, style
);
5034 return sizer
.getRegion();
5037 GooString
*TextPage::getSelectionText(PDFRectangle
*selection
,
5038 SelectionStyle style
)
5040 TextSelectionDumper
dumper(this);
5042 visitSelection(&dumper
, selection
, style
);
5045 return dumper
.getText();
5048 GooList
**TextPage::getSelectionWords(PDFRectangle
*selection
,
5049 SelectionStyle style
,
5052 TextSelectionDumper
dumper(this);
5054 visitSelection(&dumper
, selection
, style
);
5057 return dumper
.takeWordList(nLines
);
5060 GBool
TextPage::findCharRange(int pos
, int length
,
5061 double *xMin
, double *yMin
,
5062 double *xMax
, double *yMax
) {
5066 double xMin0
, xMax0
, yMin0
, yMax0
;
5067 double xMin1
, xMax1
, yMin1
, yMax1
;
5075 //~ this doesn't correctly handle ranges split across multiple lines
5076 //~ (the highlighted region is the bounding box of all the parts of
5079 xMin0
= xMax0
= yMin0
= yMax0
= 0; // make gcc happy
5080 xMin1
= xMax1
= yMin1
= yMax1
= 0; // make gcc happy
5081 for (i
= 0; i
< nBlocks
; ++i
) {
5083 for (line
= blk
->lines
; line
; line
= line
->next
) {
5084 for (word
= line
->words
; word
; word
= word
->next
) {
5085 if (pos
< word
->charPos
[word
->len
] &&
5086 pos
+ length
> word
->charPos
[0]) {
5088 j0
< word
->len
&& pos
>= word
->charPos
[j0
+ 1];
5090 for (j1
= word
->len
- 1;
5091 j1
> j0
&& pos
+ length
<= word
->charPos
[j1
];
5093 switch (line
->rot
) {
5095 xMin1
= word
->edge
[j0
];
5096 xMax1
= word
->edge
[j1
+ 1];
5103 yMin1
= word
->edge
[j0
];
5104 yMax1
= word
->edge
[j1
+ 1];
5107 xMin1
= word
->edge
[j1
+ 1];
5108 xMax1
= word
->edge
[j0
];
5115 yMin1
= word
->edge
[j1
+ 1];
5116 yMax1
= word
->edge
[j0
];
5119 if (first
|| xMin1
< xMin0
) {
5122 if (first
|| xMax1
> xMax0
) {
5125 if (first
|| yMin1
< yMin0
) {
5128 if (first
|| yMax1
> yMax0
) {
5146 void TextPage::dump(void *outputStream
, TextOutputFunc outputFunc
,
5152 TextLineFrag
*frags
;
5154 int nFrags
, fragsSize
;
5156 char space
[8], eol
[16], eop
[8];
5157 int spaceLen
, eolLen
, eopLen
;
5161 int col
, i
, j
, d
, n
;
5163 // get the output encoding
5164 if (!(uMap
= globalParams
->getTextEncoding())) {
5167 spaceLen
= uMap
->mapUnicode(0x20, space
, sizeof(space
));
5168 eolLen
= 0; // make gcc happy
5169 switch (globalParams
->getTextEOL()) {
5171 eolLen
= uMap
->mapUnicode(0x0a, eol
, sizeof(eol
));
5174 eolLen
= uMap
->mapUnicode(0x0d, eol
, sizeof(eol
));
5175 eolLen
+= uMap
->mapUnicode(0x0a, eol
+ eolLen
, sizeof(eol
) - eolLen
);
5178 eolLen
= uMap
->mapUnicode(0x0d, eol
, sizeof(eol
));
5181 eopLen
= uMap
->mapUnicode(0x0c, eop
, sizeof(eop
));
5182 pageBreaks
= globalParams
->getTextPageBreaks();
5184 //~ writing mode (horiz/vert)
5186 // output the page in raw (content stream) order
5189 for (word
= rawWords
; word
; word
= word
->next
) {
5190 s
= new GooString();
5191 dumpFragment(word
->text
, word
->len
, uMap
, s
);
5192 (*outputFunc
)(outputStream
, s
->getCString(), s
->getLength());
5195 fabs(word
->next
->base
- word
->base
) <
5196 maxIntraLineDelta
* word
->fontSize
&&
5198 word
->xMax
- minDupBreakOverlap
* word
->fontSize
) {
5199 if (word
->next
->xMin
> word
->xMax
+ minWordSpacing
* word
->fontSize
) {
5200 (*outputFunc
)(outputStream
, space
, spaceLen
);
5203 (*outputFunc
)(outputStream
, eol
, eolLen
);
5207 // output the page, maintaining the original physical layout
5208 } else if (physLayout
) {
5210 // collect the line fragments for the page and sort them
5212 frags
= (TextLineFrag
*)gmallocn(fragsSize
, sizeof(TextLineFrag
));
5214 for (i
= 0; i
< nBlocks
; ++i
) {
5216 for (line
= blk
->lines
; line
; line
= line
->next
) {
5217 if (nFrags
== fragsSize
) {
5219 frags
= (TextLineFrag
*)greallocn(frags
,
5220 fragsSize
, sizeof(TextLineFrag
));
5222 frags
[nFrags
].init(line
, 0, line
->len
);
5223 frags
[nFrags
].computeCoords(gTrue
);
5227 qsort(frags
, nFrags
, sizeof(TextLineFrag
), &TextLineFrag::cmpYXPrimaryRot
);
5229 while (i
< nFrags
) {
5230 delta
= maxIntraLineDelta
* frags
[i
].line
->words
->fontSize
;
5232 j
< nFrags
&& fabs(frags
[j
].base
- frags
[i
].base
) < delta
;
5234 qsort(frags
+ i
, j
- i
, sizeof(TextLineFrag
),
5235 &TextLineFrag::cmpXYColumnPrimaryRot
);
5239 #if 0 // for debugging
5240 printf("*** line fragments ***\n");
5241 for (i
= 0; i
< nFrags
; ++i
) {
5243 printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
5244 frag
->xMin
, frag
->xMax
, frag
->yMin
, frag
->yMax
, frag
->base
);
5245 for (n
= 0; n
< frag
->len
; ++n
) {
5246 fputc(frag
->line
->text
[frag
->start
+ n
] & 0xff, stdout
);
5255 for (i
= 0; i
< nFrags
; ++i
) {
5259 for (; col
< frag
->col
; ++col
) {
5260 (*outputFunc
)(outputStream
, space
, spaceLen
);
5264 s
= new GooString();
5265 col
+= dumpFragment(frag
->line
->text
+ frag
->start
, frag
->len
, uMap
, s
);
5266 (*outputFunc
)(outputStream
, s
->getCString(), s
->getLength());
5269 // print one or more returns if necessary
5270 if (i
== nFrags
- 1 ||
5271 frags
[i
+1].col
< col
||
5272 fabs(frags
[i
+1].base
- frag
->base
) >
5273 maxIntraLineDelta
* frag
->line
->words
->fontSize
) {
5274 if (i
< nFrags
- 1) {
5275 d
= (int)((frags
[i
+1].base
- frag
->base
) /
5276 frag
->line
->words
->fontSize
);
5285 for (; d
> 0; --d
) {
5286 (*outputFunc
)(outputStream
, eol
, eolLen
);
5294 // output the page, "undoing" the layout
5296 for (flow
= flows
; flow
; flow
= flow
->next
) {
5297 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
5298 for (line
= blk
->lines
; line
; line
= line
->next
) {
5300 if (line
->hyphenated
&& (line
->next
|| blk
->next
)) {
5303 s
= new GooString();
5304 dumpFragment(line
->text
, n
, uMap
, s
);
5305 (*outputFunc
)(outputStream
, s
->getCString(), s
->getLength());
5307 // output a newline when a hyphen is not suppressed
5308 if (n
== line
->len
) {
5309 (*outputFunc
)(outputStream
, eol
, eolLen
);
5313 (*outputFunc
)(outputStream
, eol
, eolLen
);
5319 (*outputFunc
)(outputStream
, eop
, eopLen
);
5325 void TextPage::setMergeCombining(GBool merge
) {
5326 mergeCombining
= merge
;
5329 void TextPage::assignColumns(TextLineFrag
*frags
, int nFrags
, GBool oneRot
) {
5330 TextLineFrag
*frag0
, *frag1
;
5331 int rot
, col1
, col2
, i
, j
, k
;
5333 // all text in the region has the same rotation -- recompute the
5334 // column numbers based only on the text in the region
5336 qsort(frags
, nFrags
, sizeof(TextLineFrag
), &TextLineFrag::cmpXYLineRot
);
5337 rot
= frags
[0].line
->rot
;
5338 for (i
= 0; i
< nFrags
; ++i
) {
5341 for (j
= 0; j
< i
; ++j
) {
5343 col2
= 0; // make gcc happy
5346 if (frag0
->xMin
>= frag1
->xMax
) {
5347 col2
= frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
5348 frag1
->line
->col
[frag1
->start
]) + 1;
5350 for (k
= frag1
->start
;
5351 k
< frag1
->start
+ frag1
->len
&&
5352 frag0
->xMin
>= 0.5 * (frag1
->line
->edge
[k
] +
5353 frag1
->line
->edge
[k
+1]);
5356 frag1
->line
->col
[k
] - frag1
->line
->col
[frag1
->start
];
5360 if (frag0
->yMin
>= frag1
->yMax
) {
5361 col2
= frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
5362 frag1
->line
->col
[frag1
->start
]) + 1;
5364 for (k
= frag1
->start
;
5365 k
< frag1
->start
+ frag1
->len
&&
5366 frag0
->yMin
>= 0.5 * (frag1
->line
->edge
[k
] +
5367 frag1
->line
->edge
[k
+1]);
5370 frag1
->line
->col
[k
] - frag1
->line
->col
[frag1
->start
];
5374 if (frag0
->xMax
<= frag1
->xMin
) {
5375 col2
= frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
5376 frag1
->line
->col
[frag1
->start
]) + 1;
5378 for (k
= frag1
->start
;
5379 k
< frag1
->start
+ frag1
->len
&&
5380 frag0
->xMax
<= 0.5 * (frag1
->line
->edge
[k
] +
5381 frag1
->line
->edge
[k
+1]);
5384 frag1
->line
->col
[k
] - frag1
->line
->col
[frag1
->start
];
5388 if (frag0
->yMax
<= frag1
->yMin
) {
5389 col2
= frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
5390 frag1
->line
->col
[frag1
->start
]) + 1;
5392 for (k
= frag1
->start
;
5393 k
< frag1
->start
+ frag1
->len
&&
5394 frag0
->yMax
<= 0.5 * (frag1
->line
->edge
[k
] +
5395 frag1
->line
->edge
[k
+1]);
5398 frag1
->line
->col
[k
] - frag1
->line
->col
[frag1
->start
];
5409 // the region includes text at different rotations -- use the
5410 // globally assigned column numbers, offset by the minimum column
5411 // number (i.e., shift everything over to column 0)
5413 col1
= frags
[0].col
;
5414 for (i
= 1; i
< nFrags
; ++i
) {
5415 if (frags
[i
].col
< col1
) {
5416 col1
= frags
[i
].col
;
5419 for (i
= 0; i
< nFrags
; ++i
) {
5420 frags
[i
].col
-= col1
;
5425 int TextPage::dumpFragment(Unicode
*text
, int len
, UnicodeMap
*uMap
,
5427 if (uMap
->isUnicode()) {
5428 return reorderText(text
, len
, uMap
, primaryLR
, s
, NULL
);
5435 for (int i
= 0; i
< len
; ++i
) {
5436 buflen
= uMap
->mapUnicode(text
[i
], buf
, sizeof(buf
));
5437 s
->append(buf
, buflen
);
5445 #if TEXTOUT_WORD_LIST
5446 TextWordList
*TextPage::makeWordList(GBool physLayout
) {
5447 return new TextWordList(this, physLayout
);
5451 //------------------------------------------------------------------------
5453 //------------------------------------------------------------------------
5454 ActualText::ActualText(TextPage
*out
) {
5458 actualTextNBytes
= 0;
5461 ActualText::~ActualText() {
5467 void ActualText::addChar(GfxState
*state
, double x
, double y
,
5468 double dx
, double dy
,
5469 CharCode c
, int nBytes
, Unicode
*u
, int uLen
) {
5471 text
->addChar(state
, x
, y
, dx
, dy
, c
, nBytes
, u
, uLen
);
5475 // Inside ActualText span.
5476 if (!actualTextNBytes
) {
5480 actualTextX1
= x
+ dx
;
5481 actualTextY1
= y
+ dy
;
5482 actualTextNBytes
+= nBytes
;
5485 void ActualText::begin(GfxState
*state
, GooString
*text
) {
5488 actualText
= new GooString(text
);
5489 actualTextNBytes
= 0;
5492 void ActualText::end(GfxState
*state
) {
5493 // ActualText span closed. Output the span text and the
5494 // extents of all the glyphs inside the span
5496 if (actualTextNBytes
) {
5497 Unicode
*uni
= NULL
;
5500 // now that we have the position info for all of the text inside
5501 // the marked content span, we feed the "ActualText" back through
5503 length
= TextStringToUCS4(actualText
, &uni
);
5504 text
->addChar(state
, actualTextX0
, actualTextY0
,
5505 actualTextX1
- actualTextX0
, actualTextY1
- actualTextY0
,
5506 0, actualTextNBytes
, uni
, length
);
5512 actualTextNBytes
= 0;
5515 //------------------------------------------------------------------------
5517 //------------------------------------------------------------------------
5519 static void TextOutputDev_outputToFile(void *stream
, const char *text
, int len
) {
5520 fwrite(text
, 1, len
, (FILE *)stream
);
5523 TextOutputDev::TextOutputDev(char *fileName
, GBool physLayoutA
,
5524 double fixedPitchA
, GBool rawOrderA
,
5527 physLayout
= physLayoutA
;
5528 fixedPitch
= physLayout
? fixedPitchA
: 0;
5529 rawOrder
= rawOrderA
;
5536 if (!strcmp(fileName
, "-")) {
5537 outputStream
= stdout
;
5539 // keep DOS from munging the end-of-line characters
5540 setmode(fileno(stdout
), O_BINARY
);
5542 } else if ((outputStream
= fopen(fileName
, append
? "ab" : "wb"))) {
5545 error(errIO
, -1, "Couldn't open text file '{0:s}'", fileName
);
5550 outputFunc
= &TextOutputDev_outputToFile
;
5552 outputStream
= NULL
;
5555 // set up text object
5556 text
= new TextPage(rawOrderA
);
5557 actualText
= new ActualText(text
);
5560 TextOutputDev::TextOutputDev(TextOutputFunc func
, void *stream
,
5561 GBool physLayoutA
, double fixedPitchA
,
5564 outputStream
= stream
;
5566 physLayout
= physLayoutA
;
5567 fixedPitch
= physLayout
? fixedPitchA
: 0;
5568 rawOrder
= rawOrderA
;
5570 text
= new TextPage(rawOrderA
);
5571 actualText
= new ActualText(text
);
5575 TextOutputDev::~TextOutputDev() {
5578 ICS_MapRefNumAndAssign((short)((FILE *)outputStream
)->handle
);
5580 fclose((FILE *)outputStream
);
5588 void TextOutputDev::startPage(int pageNum
, GfxState
*state
, XRef
*xref
) {
5589 text
->startPage(state
);
5592 void TextOutputDev::endPage() {
5594 text
->coalesce(physLayout
, fixedPitch
, doHTML
);
5596 text
->dump(outputStream
, outputFunc
, physLayout
);
5600 void TextOutputDev::restoreState(GfxState
*state
) {
5601 text
->updateFont(state
);
5604 void TextOutputDev::updateFont(GfxState
*state
) {
5605 text
->updateFont(state
);
5608 void TextOutputDev::beginString(GfxState
*state
, GooString
*s
) {
5611 void TextOutputDev::endString(GfxState
*state
) {
5614 void TextOutputDev::drawChar(GfxState
*state
, double x
, double y
,
5615 double dx
, double dy
,
5616 double originX
, double originY
,
5617 CharCode c
, int nBytes
, Unicode
*u
, int uLen
) {
5618 actualText
->addChar(state
, x
, y
, dx
, dy
, c
, nBytes
, u
, uLen
);
5621 void TextOutputDev::incCharCount(int nChars
) {
5622 text
->incCharCount(nChars
);
5625 void TextOutputDev::beginActualText(GfxState
*state
, GooString
*text
)
5627 actualText
->begin(state
, text
);
5630 void TextOutputDev::endActualText(GfxState
*state
)
5632 actualText
->end(state
);
5635 void TextOutputDev::stroke(GfxState
*state
) {
5637 GfxSubpath
*subpath
;
5643 path
= state
->getPath();
5644 if (path
->getNumSubpaths() != 1) {
5647 subpath
= path
->getSubpath(0);
5648 if (subpath
->getNumPoints() != 2) {
5651 state
->transform(subpath
->getX(0), subpath
->getY(0), &x
[0], &y
[0]);
5652 state
->transform(subpath
->getX(1), subpath
->getY(1), &x
[1], &y
[1]);
5654 // look for a vertical or horizontal line
5655 if (x
[0] == x
[1] || y
[0] == y
[1]) {
5656 text
->addUnderline(x
[0], y
[0], x
[1], y
[1]);
5660 void TextOutputDev::fill(GfxState
*state
) {
5662 GfxSubpath
*subpath
;
5664 double rx0
, ry0
, rx1
, ry1
, t
;
5670 path
= state
->getPath();
5671 if (path
->getNumSubpaths() != 1) {
5674 subpath
= path
->getSubpath(0);
5675 if (subpath
->getNumPoints() != 5) {
5678 for (i
= 0; i
< 5; ++i
) {
5679 if (subpath
->getCurve(i
)) {
5682 state
->transform(subpath
->getX(i
), subpath
->getY(i
), &x
[i
], &y
[i
]);
5685 // look for a rectangle
5686 if (x
[0] == x
[1] && y
[1] == y
[2] && x
[2] == x
[3] && y
[3] == y
[4] &&
5687 x
[0] == x
[4] && y
[0] == y
[4]) {
5692 } else if (y
[0] == y
[1] && x
[1] == x
[2] && y
[2] == y
[3] && x
[3] == x
[4] &&
5693 x
[0] == x
[4] && y
[0] == y
[4]) {
5712 // skinny horizontal rectangle
5713 if (ry1
- ry0
< rx1
- rx0
) {
5714 if (ry1
- ry0
< maxUnderlineWidth
) {
5715 ry0
= 0.5 * (ry0
+ ry1
);
5716 text
->addUnderline(rx0
, ry0
, rx1
, ry0
);
5719 // skinny vertical rectangle
5721 if (rx1
- rx0
< maxUnderlineWidth
) {
5722 rx0
= 0.5 * (rx0
+ rx1
);
5723 text
->addUnderline(rx0
, ry0
, rx0
, ry1
);
5728 void TextOutputDev::eoFill(GfxState
*state
) {
5735 void TextOutputDev::processLink(AnnotLink
*link
) {
5736 double x1
, y1
, x2
, y2
;
5737 int xMin
, yMin
, xMax
, yMax
, x
, y
;
5742 link
->getRect(&x1
, &y1
, &x2
, &y2
);
5743 cvtUserToDev(x1
, y1
, &x
, &y
);
5746 cvtUserToDev(x1
, y2
, &x
, &y
);
5749 } else if (x
> xMax
) {
5754 } else if (y
> yMax
) {
5757 cvtUserToDev(x2
, y1
, &x
, &y
);
5760 } else if (x
> xMax
) {
5765 } else if (y
> yMax
) {
5768 cvtUserToDev(x2
, y2
, &x
, &y
);
5771 } else if (x
> xMax
) {
5776 } else if (y
> yMax
) {
5779 text
->addLink(xMin
, yMin
, xMax
, yMax
, link
);
5782 GBool
TextOutputDev::findText(Unicode
*s
, int len
,
5783 GBool startAtTop
, GBool stopAtBottom
,
5784 GBool startAtLast
, GBool stopAtLast
,
5785 GBool caseSensitive
, GBool backward
,
5787 double *xMin
, double *yMin
,
5788 double *xMax
, double *yMax
) {
5789 return text
->findText(s
, len
, startAtTop
, stopAtBottom
,
5790 startAtLast
, stopAtLast
,
5791 caseSensitive
, backward
, wholeWord
,
5792 xMin
, yMin
, xMax
, yMax
);
5795 GooString
*TextOutputDev::getText(double xMin
, double yMin
,
5796 double xMax
, double yMax
) {
5797 return text
->getText(xMin
, yMin
, xMax
, yMax
);
5800 void TextOutputDev::drawSelection(OutputDev
*out
,
5803 PDFRectangle
*selection
,
5804 SelectionStyle style
,
5805 GfxColor
*glyph_color
, GfxColor
*box_color
) {
5806 text
->drawSelection(out
, scale
, rotation
, selection
, style
, glyph_color
, box_color
);
5809 GooList
*TextOutputDev::getSelectionRegion(PDFRectangle
*selection
,
5810 SelectionStyle style
,
5812 return text
->getSelectionRegion(selection
, style
, scale
);
5815 GooString
*TextOutputDev::getSelectionText(PDFRectangle
*selection
,
5816 SelectionStyle style
)
5818 return text
->getSelectionText(selection
, style
);
5821 GBool
TextOutputDev::findCharRange(int pos
, int length
,
5822 double *xMin
, double *yMin
,
5823 double *xMax
, double *yMax
) {
5824 return text
->findCharRange(pos
, length
, xMin
, yMin
, xMax
, yMax
);
5827 void TextOutputDev::setMergeCombining(GBool merge
) {
5828 text
->setMergeCombining(merge
);
5831 #if TEXTOUT_WORD_LIST
5832 TextWordList
*TextOutputDev::makeWordList() {
5833 return text
->makeWordList(physLayout
);
5837 TextPage
*TextOutputDev::takeText() {
5841 text
= new TextPage(rawOrder
);