1 //========================================================================
5 // Copyright 1997-2003 Glyph & Cog, LLC
7 //========================================================================
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h> // for setmode
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
31 #include "UnicodeTypeTable.h"
34 #include "TextOutputDev.h"
37 // needed for setting type/creator of MacOS files
38 #include "ICSupport.h"
41 //------------------------------------------------------------------------
43 //------------------------------------------------------------------------
45 // Each bucket in a text pool includes baselines within a range of this many points.
47 #define textPoolStep 4
49 // Inter-character space width which will cause addChar to start a new word.
51 #define minWordBreakSpace 0.1
53 // Negative inter-character space width, i.e., overlap, which will
54 // cause addChar to start a new word.
55 #define minDupBreakOverlap 0.2
57 // Max distance between baselines of two lines within a block, as a
58 // fraction of the font size.
59 #define maxLineSpacingDelta 1.5
61 // Max difference in primary font sizes on two lines in the same
62 // block. Delta1 is used when examining new lines above and below the
63 // current block; delta2 is used when examining text that overlaps the
64 // current block; delta3 is used when examining text to the left and
65 // right of the current block.
66 #define maxBlockFontSizeDelta1 0.05
67 #define maxBlockFontSizeDelta2 0.6
68 #define maxBlockFontSizeDelta3 0.2
70 // Max difference in font sizes inside a word.
71 #define maxWordFontSizeDelta 0.05
73 // Maximum distance between baselines of two words on the same line,
74 // e.g., distance between subscript or superscript and the primary
75 // baseline, as a fraction of the font size.
76 #define maxIntraLineDelta 0.5
78 // Minimum inter-word spacing, as a fraction of the font size. (Only
79 // used for raw ordering.)
80 #define minWordSpacing 0.15
82 // Maximum inter-word spacing, as a fraction of the font size.
83 #define maxWordSpacing 1.5
85 // Maximum horizontal spacing which will allow a word to be pulled into a block.
87 #define minColSpacing1 0.3
89 // Minimum spacing between columns, as a fraction of the font size.
90 #define minColSpacing2 1.0
92 // Maximum vertical spacing between blocks within a flow, as a
93 // multiple of the font size.
94 #define maxBlockSpacing 2.5
96 // Minimum spacing between characters within a word, as a fraction of the font size.
98 #define minCharSpacing -0.2
100 // Maximum spacing between characters within a word, as a fraction of
101 // the font size, when there is no obvious extra-wide character spacing.
103 #define maxCharSpacing 0.03
105 // When extra-wide character spacing is detected, the inter-character
106 // space threshold is set to the minimum inter-character space
107 // multiplied by this constant.
108 #define maxWideCharSpacingMul 1.3
110 // Upper limit on spacing between characters in a word.
111 #define maxWideCharSpacing 0.4
113 // Max difference in primary,secondary coordinates (as a fraction of
114 // the font size) allowed for duplicated text (fake boldface, drop
115 // shadows) which is to be discarded.
116 #define dupMaxPriDelta 0.1
117 #define dupMaxSecDelta 0.2
119 // Max width of underlines (in points).
120 #define maxUnderlineWidth 3
122 // Min distance between baseline and underline (in points).
123 //~ this should be font-size-dependent
124 #define minUnderlineGap -2
126 // Max distance between baseline and underline (in points).
127 //~ this should be font-size-dependent
128 #define maxUnderlineGap 4
130 // Max horizontal distance between edge of word and start of underline (in points).
132 //~ this should be font-size-dependent
133 #define underlineSlack 1
135 // Max distance between edge of text and edge of link border
136 #define hyperlinkSlack 2
138 //------------------------------------------------------------------------
140 //------------------------------------------------------------------------
142 class TextUnderline
{
145 TextUnderline(double x0A
, double y0A
, double x1A
, double y1A
)
146 { x0
= x0A
; y0
= y0A
; x1
= x1A
; y1
= y1A
; horiz
= y0
== y1
; }
149 double x0
, y0
, x1
, y1
;
153 //------------------------------------------------------------------------
155 //------------------------------------------------------------------------
160 TextLink(int xMinA
, int yMinA
, int xMaxA
, int yMaxA
, Link
*linkA
)
161 { xMin
= xMinA
; yMin
= yMinA
; xMax
= xMaxA
; yMax
= yMaxA
; link
= linkA
; }
164 int xMin
, yMin
, xMax
, yMax
;
168 //------------------------------------------------------------------------
170 //------------------------------------------------------------------------
172 TextFontInfo::TextFontInfo(GfxState
*state
) {
173 gfxFont
= state
->getFont();
174 #if TEXTOUT_WORD_LIST
175 fontName
= (gfxFont
&& gfxFont
->getOrigName())
176 ? gfxFont
->getOrigName()->copy()
178 flags
= gfxFont
? gfxFont
->getFlags() : 0;
182 TextFontInfo::~TextFontInfo() {
183 #if TEXTOUT_WORD_LIST
// Reports whether the font currently selected in <state> is the very
// same GfxFont object this TextFontInfo was built from.  The test is a
// pointer comparison against the stored gfxFont, not a comparison of
// font names or attributes.
190 GBool
TextFontInfo::matches(GfxState
*state
) {
191 return state
->getFont() == gfxFont
;
194 //------------------------------------------------------------------------
196 //------------------------------------------------------------------------
198 TextWord::TextWord(GfxState
*state
, int rotA
, double x0
, double y0
,
199 int charPosA
, TextFontInfo
*fontA
, double fontSizeA
) {
201 double x
, y
, ascent
, descent
;
207 fontSize
= fontSizeA
;
208 state
->transform(x0
, y0
, &x
, &y
);
209 if ((gfxFont
= font
->gfxFont
)) {
210 ascent
= gfxFont
->getAscent() * fontSize
;
211 descent
= gfxFont
->getDescent() * fontSize
;
213 // this means that the PDF file draws text without a current font,
214 // which should never happen
215 ascent
= 0.95 * fontSize
;
216 descent
= -0.35 * fontSize
;
223 // this is a sanity check for a case that shouldn't happen -- but
224 // if it does happen, we want to avoid dividing by zero later
234 // this is a sanity check for a case that shouldn't happen -- but
235 // if it does happen, we want to avoid dividing by zero later
245 // this is a sanity check for a case that shouldn't happen -- but
246 // if it does happen, we want to avoid dividing by zero later
256 // this is a sanity check for a case that shouldn't happen -- but
257 // if it does happen, we want to avoid dividing by zero later
270 #if TEXTOUT_WORD_LIST
273 if ((state
->getRender() & 3) == 1) {
274 state
->getStrokeRGB(&rgb
);
276 state
->getFillRGB(&rgb
);
278 colorR
= colToDbl(rgb
.r
);
279 colorG
= colToDbl(rgb
.g
);
280 colorB
= colToDbl(rgb
.b
);
287 TextWord::~TextWord() {
292 void TextWord::addChar(GfxState
*state
, double x
, double y
,
293 double dx
, double dy
, Unicode u
) {
296 text
= (Unicode
*)greallocn(text
, size
, sizeof(Unicode
));
297 edge
= (double *)greallocn(edge
, size
+ 1, sizeof(double));
306 xMax
= edge
[len
+1] = x
+ dx
;
313 yMax
= edge
[len
+1] = y
+ dy
;
320 xMin
= edge
[len
+1] = x
+ dx
;
327 yMin
= edge
[len
+1] = y
+ dy
;
333 void TextWord::merge(TextWord
*word
) {
336 if (word
->xMin
< xMin
) {
339 if (word
->yMin
< yMin
) {
342 if (word
->xMax
> xMax
) {
345 if (word
->yMax
> yMax
) {
348 if (len
+ word
->len
> size
) {
349 size
= len
+ word
->len
;
350 text
= (Unicode
*)greallocn(text
, size
, sizeof(Unicode
));
351 edge
= (double *)greallocn(edge
, size
+ 1, sizeof(double));
353 for (i
= 0; i
< word
->len
; ++i
) {
354 text
[len
+ i
] = word
->text
[i
];
355 edge
[len
+ i
] = word
->edge
[i
];
357 edge
[len
+ word
->len
] = word
->edge
[word
->len
];
359 charLen
+= word
->charLen
;
362 inline int TextWord::primaryCmp(TextWord
*word
) {
365 cmp
= 0; // make gcc happy
368 cmp
= xMin
- word
->xMin
;
371 cmp
= yMin
- word
->yMin
;
374 cmp
= word
->xMax
- xMax
;
377 cmp
= word
->yMax
- yMax
;
380 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
383 double TextWord::primaryDelta(TextWord
*word
) {
386 delta
= 0; // make gcc happy
389 delta
= word
->xMin
- xMax
;
392 delta
= word
->yMin
- yMax
;
395 delta
= xMin
- word
->xMax
;
398 delta
= yMin
- word
->yMax
;
// qsort()-style comparator.  <p1> and <p2> each point at an array
// element holding a TextWord pointer (note the double indirection).
// The comparison uses the difference of the words' yMin values and the
// difference of their xMin values, and the result is normalized to
// -1, 0, or 1.
// NOTE(review): the condition selecting between the yMin and xMin
// differences is not visible here -- presumably xMin is only the
// tie-breaker when the yMin values are equal; confirm.
404 int TextWord::cmpYX(const void *p1
, const void *p2
) {
405 TextWord
*word1
= *(TextWord
**)p1
;
406 TextWord
*word2
= *(TextWord
**)p2
;
409 cmp
= word1
->yMin
- word2
->yMin
;
411 cmp
= word1
->xMin
- word2
->xMin
;
413 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
416 #if TEXTOUT_WORD_LIST
418 GString
*TextWord::getText() {
425 if (!(uMap
= globalParams
->getTextEncoding())) {
428 for (i
= 0; i
< len
; ++i
) {
429 n
= uMap
->mapUnicode(text
[i
], buf
, sizeof(buf
));
436 void TextWord::getCharBBox(int charIdx
, double *xMinA
, double *yMinA
,
437 double *xMaxA
, double *yMaxA
) {
438 if (charIdx
< 0 || charIdx
>= len
) {
443 *xMinA
= edge
[charIdx
];
444 *xMaxA
= edge
[charIdx
+ 1];
451 *yMinA
= edge
[charIdx
];
452 *yMaxA
= edge
[charIdx
+ 1];
455 *xMinA
= edge
[charIdx
+ 1];
456 *xMaxA
= edge
[charIdx
];
463 *yMinA
= edge
[charIdx
+ 1];
464 *yMaxA
= edge
[charIdx
];
469 #endif // TEXTOUT_WORD_LIST
471 //------------------------------------------------------------------------
473 //------------------------------------------------------------------------
475 TextPool::TextPool() {
483 TextPool::~TextPool() {
485 TextWord
*word
, *word2
;
487 for (baseIdx
= minBaseIdx
; baseIdx
<= maxBaseIdx
; ++baseIdx
) {
488 for (word
= pool
[baseIdx
- minBaseIdx
]; word
; word
= word2
) {
496 int TextPool::getBaseIdx(double base
) {
499 baseIdx
= (int)(base
/ textPoolStep
);
500 if (baseIdx
< minBaseIdx
) {
503 if (baseIdx
> maxBaseIdx
) {
509 void TextPool::addWord(TextWord
*word
) {
511 int wordBaseIdx
, newMinBaseIdx
, newMaxBaseIdx
, baseIdx
;
514 // expand the array if needed
515 wordBaseIdx
= (int)(word
->base
/ textPoolStep
);
516 if (minBaseIdx
> maxBaseIdx
) {
517 minBaseIdx
= wordBaseIdx
- 128;
518 maxBaseIdx
= wordBaseIdx
+ 128;
519 pool
= (TextWord
**)gmallocn(maxBaseIdx
- minBaseIdx
+ 1,
521 for (baseIdx
= minBaseIdx
; baseIdx
<= maxBaseIdx
; ++baseIdx
) {
522 pool
[baseIdx
- minBaseIdx
] = NULL
;
524 } else if (wordBaseIdx
< minBaseIdx
) {
525 newMinBaseIdx
= wordBaseIdx
- 128;
526 newPool
= (TextWord
**)gmallocn(maxBaseIdx
- newMinBaseIdx
+ 1,
528 for (baseIdx
= newMinBaseIdx
; baseIdx
< minBaseIdx
; ++baseIdx
) {
529 newPool
[baseIdx
- newMinBaseIdx
] = NULL
;
531 memcpy(&newPool
[minBaseIdx
- newMinBaseIdx
], pool
,
532 (maxBaseIdx
- minBaseIdx
+ 1) * sizeof(TextWord
*));
535 minBaseIdx
= newMinBaseIdx
;
536 } else if (wordBaseIdx
> maxBaseIdx
) {
537 newMaxBaseIdx
= wordBaseIdx
+ 128;
538 pool
= (TextWord
**)greallocn(pool
, newMaxBaseIdx
- minBaseIdx
+ 1,
540 for (baseIdx
= maxBaseIdx
+ 1; baseIdx
<= newMaxBaseIdx
; ++baseIdx
) {
541 pool
[baseIdx
- minBaseIdx
] = NULL
;
543 maxBaseIdx
= newMaxBaseIdx
;
546 // insert the new word
547 if (cursor
&& wordBaseIdx
== cursorBaseIdx
&&
548 word
->primaryCmp(cursor
) > 0) {
553 w1
= pool
[wordBaseIdx
- minBaseIdx
];
555 for (; w1
&& word
->primaryCmp(w1
) > 0; w0
= w1
, w1
= w1
->next
) ;
560 pool
[wordBaseIdx
- minBaseIdx
] = word
;
563 cursorBaseIdx
= wordBaseIdx
;
566 //------------------------------------------------------------------------
568 //------------------------------------------------------------------------
570 TextLine::TextLine(TextBlock
*blkA
, int rotA
, double baseA
) {
576 words
= lastWord
= NULL
;
586 TextLine::~TextLine() {
599 void TextLine::addWord(TextWord
*word
) {
601 lastWord
->next
= word
;
613 if (word
->xMin
< xMin
) {
616 if (word
->xMax
> xMax
) {
619 if (word
->yMin
< yMin
) {
622 if (word
->yMax
> yMax
) {
628 double TextLine::primaryDelta(TextLine
*line
) {
631 delta
= 0; // make gcc happy
634 delta
= line
->xMin
- xMax
;
637 delta
= line
->yMin
- yMax
;
640 delta
= xMin
- line
->xMax
;
643 delta
= yMin
- line
->yMax
;
649 int TextLine::primaryCmp(TextLine
*line
) {
652 cmp
= 0; // make gcc happy
655 cmp
= xMin
- line
->xMin
;
658 cmp
= yMin
- line
->yMin
;
661 cmp
= line
->xMax
- xMax
;
664 cmp
= line
->yMax
- yMax
;
667 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
// Compares this line with <line> by baseline coordinate.  For
// rotations 0 and 3 the difference is (this->base - line->base); for
// the other rotations the operands are swapped, which reverses the
// ordering.  Returns the sign of that difference: -1, 0, or 1.
670 int TextLine::secondaryCmp(TextLine
*line
) {
673 cmp
= (rot
== 0 || rot
== 3) ? base
- line
->base
: line
->base
- base
;
674 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
677 int TextLine::cmpYX(TextLine
*line
) {
680 if ((cmp
= secondaryCmp(line
))) {
683 return primaryCmp(line
);
// qsort()-style comparator.  <p1> and <p2> each point at an array
// element holding a TextLine pointer (note the double indirection).
// The lines are compared with primaryCmp() first; when that result is
// zero the outcome is decided by secondaryCmp().
// NOTE(review): the statement executed when primaryCmp() is non-zero
// is not visible here -- presumably it returns that value; confirm.
686 int TextLine::cmpXY(const void *p1
, const void *p2
) {
687 TextLine
*line1
= *(TextLine
**)p1
;
688 TextLine
*line2
= *(TextLine
**)p2
;
691 if ((cmp
= line1
->primaryCmp(line2
))) {
694 return line1
->secondaryCmp(line2
);
697 void TextLine::coalesce(UnicodeMap
*uMap
) {
698 TextWord
*word0
, *word1
;
699 double space
, delta
, minSpace
;
706 // compute the inter-word space threshold
707 if (words
->len
> 1 || words
->next
->len
> 1) {
710 minSpace
= words
->primaryDelta(words
->next
);
711 for (word0
= words
->next
, word1
= word0
->next
;
712 word1
&& minSpace
> 0;
713 word0
= word1
, word1
= word0
->next
) {
714 if (word1
->len
> 1) {
717 delta
= word0
->primaryDelta(word1
);
718 if (delta
< minSpace
) {
724 space
= maxCharSpacing
* words
->fontSize
;
726 space
= maxWideCharSpacingMul
* minSpace
;
727 if (space
> maxWideCharSpacing
* words
->fontSize
) {
728 space
= maxWideCharSpacing
* words
->fontSize
;
736 if (word0
->primaryDelta(word1
) >= space
) {
737 word0
->spaceAfter
= gTrue
;
740 } else if (word0
->font
== word1
->font
&&
741 word0
->underlined
== word1
->underlined
&&
742 fabs(word0
->fontSize
- word1
->fontSize
) <
743 maxWordFontSizeDelta
* words
->fontSize
&&
744 word1
->charPos
== word0
->charPos
+ word0
->charLen
) {
746 word0
->next
= word1
->next
;
756 // build the line text
757 isUnicode
= uMap
? uMap
->isUnicode() : gFalse
;
759 for (word1
= words
; word1
; word1
= word1
->next
) {
761 if (word1
->spaceAfter
) {
765 text
= (Unicode
*)gmallocn(len
, sizeof(Unicode
));
766 edge
= (double *)gmallocn(len
+ 1, sizeof(double));
768 for (word1
= words
; word1
; word1
= word1
->next
) {
769 for (j
= 0; j
< word1
->len
; ++j
) {
770 text
[i
] = word1
->text
[j
];
771 edge
[i
] = word1
->edge
[j
];
774 edge
[i
] = word1
->edge
[word1
->len
];
775 if (word1
->spaceAfter
) {
776 text
[i
] = (Unicode
)0x0020;
781 // compute convertedLen and set up the col array
782 col
= (int *)gmallocn(len
+ 1, sizeof(int));
784 for (i
= 0; i
< len
; ++i
) {
785 col
[i
] = convertedLen
;
789 convertedLen
+= uMap
->mapUnicode(text
[i
], buf
, sizeof(buf
));
792 col
[len
] = convertedLen
;
794 // check for hyphen at end of line
795 //~ need to check for other chars used as hyphens
796 hyphenated
= text
[len
- 1] == (Unicode
)'-';
799 //------------------------------------------------------------------------
801 //------------------------------------------------------------------------
806 TextLine
*line
; // the line xObject
807 int start
, len
; // offset and length of this fragment
808 // (in Unicode chars)
809 double xMin
, xMax
; // bounding box coordinates
811 double base
; // baseline virtual coordinate
812 int col
; // first column
814 void init(TextLine
*lineA
, int startA
, int lenA
);
815 void computeCoords(GBool oneRot
);
817 static int cmpYXPrimaryRot(const void *p1
, const void *p2
);
818 static int cmpYXLineRot(const void *p1
, const void *p2
);
819 static int cmpXYLineRot(const void *p1
, const void *p2
);
820 static int cmpXYColumnPrimaryRot(const void *p1
, const void *p2
);
821 static int cmpXYColumnLineRot(const void *p1
, const void *p2
);
824 void TextLineFrag::init(TextLine
*lineA
, int startA
, int lenA
) {
828 col
= line
->col
[start
];
831 void TextLineFrag::computeCoords(GBool oneRot
) {
833 double d0
, d1
, d2
, d3
, d4
;
839 xMin
= line
->edge
[start
];
840 xMax
= line
->edge
[start
+ len
];
847 yMin
= line
->edge
[start
];
848 yMax
= line
->edge
[start
+ len
];
851 xMin
= line
->edge
[start
+ len
];
852 xMax
= line
->edge
[start
];
859 yMin
= line
->edge
[start
+ len
];
860 yMax
= line
->edge
[start
];
867 if (line
->rot
== 0 && line
->blk
->page
->primaryRot
== 0) {
869 xMin
= line
->edge
[start
];
870 xMax
= line
->edge
[start
+ len
];
878 d0
= line
->edge
[start
];
879 d1
= line
->edge
[start
+ len
];
880 d2
= d3
= d4
= 0; // make gcc happy
887 d0
= (d0
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
888 d1
= (d1
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
889 d2
= (d2
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
890 d3
= (d3
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
891 d4
= (d4
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
897 d0
= (d0
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
898 d1
= (d1
- blk
->yMin
) / (blk
->yMax
- blk
->yMin
);
899 d2
= (blk
->xMax
- d2
) / (blk
->xMax
- blk
->xMin
);
900 d3
= (blk
->xMax
- d3
) / (blk
->xMax
- blk
->xMin
);
901 d4
= (blk
->xMax
- d4
) / (blk
->xMax
- blk
->xMin
);
907 d0
= (blk
->xMax
- d0
) / (blk
->xMax
- blk
->xMin
);
908 d1
= (blk
->xMax
- d1
) / (blk
->xMax
- blk
->xMin
);
909 d2
= (blk
->yMax
- d2
) / (blk
->yMax
- blk
->yMin
);
910 d3
= (blk
->yMax
- d3
) / (blk
->yMax
- blk
->yMin
);
911 d4
= (blk
->yMax
- d4
) / (blk
->yMax
- blk
->yMin
);
917 d0
= (blk
->yMax
- d0
) / (blk
->yMax
- blk
->yMin
);
918 d1
= (blk
->yMax
- d1
) / (blk
->yMax
- blk
->yMin
);
919 d2
= (d2
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
920 d3
= (d3
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
921 d4
= (d4
- blk
->xMin
) / (blk
->xMax
- blk
->xMin
);
925 switch (line
->blk
->page
->primaryRot
) {
927 xMin
= blk
->xMin
+ d0
* (blk
->xMax
- blk
->xMin
);
928 xMax
= blk
->xMin
+ d1
* (blk
->xMax
- blk
->xMin
);
929 yMin
= blk
->yMin
+ d2
* (blk
->yMax
- blk
->yMin
);
930 yMax
= blk
->yMin
+ d3
* (blk
->yMax
- blk
->yMin
);
931 base
= blk
->yMin
+ base
* (blk
->yMax
- blk
->yMin
);
934 xMin
= blk
->xMax
- d3
* (blk
->xMax
- blk
->xMin
);
935 xMax
= blk
->xMax
- d2
* (blk
->xMax
- blk
->xMin
);
936 yMin
= blk
->yMin
+ d0
* (blk
->yMax
- blk
->yMin
);
937 yMax
= blk
->yMin
+ d1
* (blk
->yMax
- blk
->yMin
);
938 base
= blk
->xMax
- d4
* (blk
->xMax
- blk
->xMin
);
941 xMin
= blk
->xMax
- d1
* (blk
->xMax
- blk
->xMin
);
942 xMax
= blk
->xMax
- d0
* (blk
->xMax
- blk
->xMin
);
943 yMin
= blk
->yMax
- d3
* (blk
->yMax
- blk
->yMin
);
944 yMax
= blk
->yMax
- d2
* (blk
->yMax
- blk
->yMin
);
945 base
= blk
->yMax
- d4
* (blk
->yMax
- blk
->yMin
);
948 xMin
= blk
->xMin
+ d2
* (blk
->xMax
- blk
->xMin
);
949 xMax
= blk
->xMin
+ d3
* (blk
->xMax
- blk
->xMin
);
950 yMin
= blk
->yMax
- d1
* (blk
->yMax
- blk
->yMin
);
951 yMax
= blk
->yMax
- d0
* (blk
->yMax
- blk
->yMin
);
952 base
= blk
->xMin
+ d4
* (blk
->xMax
- blk
->xMin
);
960 int TextLineFrag::cmpYXPrimaryRot(const void *p1
, const void *p2
) {
961 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
962 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
965 cmp
= 0; // make gcc happy
966 switch (frag1
->line
->blk
->page
->primaryRot
) {
968 if (fabs(cmp
= frag1
->yMin
- frag2
->yMin
) < 0.01) {
969 cmp
= frag1
->xMin
- frag2
->xMin
;
973 if (fabs(cmp
= frag2
->xMax
- frag1
->xMax
) < 0.01) {
974 cmp
= frag1
->yMin
- frag2
->yMin
;
978 if (fabs(cmp
= frag2
->yMin
- frag1
->yMin
) < 0.01) {
979 cmp
= frag2
->xMax
- frag1
->xMax
;
983 if (fabs(cmp
= frag1
->xMax
- frag2
->xMax
) < 0.01) {
984 cmp
= frag2
->yMax
- frag1
->yMax
;
988 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
991 int TextLineFrag::cmpYXLineRot(const void *p1
, const void *p2
) {
992 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
993 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
996 cmp
= 0; // make gcc happy
997 switch (frag1
->line
->rot
) {
999 if ((cmp
= frag1
->yMin
- frag2
->yMin
) == 0) {
1000 cmp
= frag1
->xMin
- frag2
->xMin
;
1004 if ((cmp
= frag2
->xMax
- frag1
->xMax
) == 0) {
1005 cmp
= frag1
->yMin
- frag2
->yMin
;
1009 if ((cmp
= frag2
->yMin
- frag1
->yMin
) == 0) {
1010 cmp
= frag2
->xMax
- frag1
->xMax
;
1014 if ((cmp
= frag1
->xMax
- frag2
->xMax
) == 0) {
1015 cmp
= frag2
->yMax
- frag1
->yMax
;
1019 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1022 int TextLineFrag::cmpXYLineRot(const void *p1
, const void *p2
) {
1023 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
1024 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
1027 cmp
= 0; // make gcc happy
1028 switch (frag1
->line
->rot
) {
1030 if ((cmp
= frag1
->xMin
- frag2
->xMin
) == 0) {
1031 cmp
= frag1
->yMin
- frag2
->yMin
;
1035 if ((cmp
= frag1
->yMin
- frag2
->yMin
) == 0) {
1036 cmp
= frag2
->xMax
- frag1
->xMax
;
1040 if ((cmp
= frag2
->xMax
- frag1
->xMax
) == 0) {
1041 cmp
= frag2
->yMin
- frag1
->yMin
;
1045 if ((cmp
= frag2
->yMax
- frag1
->yMax
) == 0) {
1046 cmp
= frag1
->xMax
- frag2
->xMax
;
1050 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
// qsort()-style comparator.  <p1> and <p2> point directly at
// TextLineFrag objects (single indirection, unlike the TextWord and
// TextLine comparators).
//
// Each fragment spans the columns [col, col + width), where width is
// line->col[start + len] - line->col[start].  When the two column spans
// overlap, the fragments are ordered by a coordinate chosen from the
// page's primary rotation (read through frag1):
//   primaryRot 0: ascending yMin     primaryRot 1: descending xMax
//   primaryRot 2: descending yMin    primaryRot 3: ascending xMax
// and the result is normalized to -1, 0, or 1.  When the spans do not
// overlap, the result is the raw difference of the starting columns
// (not normalized).
1053 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1
, const void *p2
) {
1054 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
1055 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
1058 // if columns overlap, compare y values
1059 if (frag1
->col
< frag2
->col
+ (frag2
->line
->col
[frag2
->start
+ frag2
->len
] -
1060 frag2
->line
->col
[frag2
->start
]) &&
1061 frag2
->col
< frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
1062 frag1
->line
->col
[frag1
->start
])) {
1063 cmp
= 0; // make gcc happy
1064 switch (frag1
->line
->blk
->page
->primaryRot
) {
1065 case 0: cmp
= frag1
->yMin
- frag2
->yMin
; break;
1066 case 1: cmp
= frag2
->xMax
- frag1
->xMax
; break;
1067 case 2: cmp
= frag2
->yMin
- frag1
->yMin
; break;
1068 case 3: cmp
= frag1
->xMax
- frag2
->xMax
; break;
1070 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1073 // otherwise, compare starting column
1074 return frag1
->col
- frag2
->col
;
// qsort()-style comparator.  <p1> and <p2> point directly at
// TextLineFrag objects (single indirection).
//
// Same scheme as the primary-rotation variant, except that the
// coordinate used for overlapping fragments is chosen from the rotation
// of frag1's own line rather than from the page's primary rotation:
//   rot 0: ascending yMin     rot 1: descending xMax
//   rot 2: descending yMin    rot 3: ascending xMax
// That result is normalized to -1, 0, or 1.  Fragments whose column
// spans [col, col + width) do not overlap are ordered by the raw
// difference of their starting columns (not normalized).
1077 int TextLineFrag::cmpXYColumnLineRot(const void *p1
, const void *p2
) {
1078 TextLineFrag
*frag1
= (TextLineFrag
*)p1
;
1079 TextLineFrag
*frag2
= (TextLineFrag
*)p2
;
1082 // if columns overlap, compare y values
1083 if (frag1
->col
< frag2
->col
+ (frag2
->line
->col
[frag2
->start
+ frag2
->len
] -
1084 frag2
->line
->col
[frag2
->start
]) &&
1085 frag2
->col
< frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
1086 frag1
->line
->col
[frag1
->start
])) {
1087 cmp
= 0; // make gcc happy
1088 switch (frag1
->line
->rot
) {
1089 case 0: cmp
= frag1
->yMin
- frag2
->yMin
; break;
1090 case 1: cmp
= frag2
->xMax
- frag1
->xMax
; break;
1091 case 2: cmp
= frag2
->yMin
- frag1
->yMin
; break;
1092 case 3: cmp
= frag1
->xMax
- frag2
->xMax
; break;
1094 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1097 // otherwise, compare starting column
1098 return frag1
->col
- frag2
->col
;
1101 //------------------------------------------------------------------------
1103 //------------------------------------------------------------------------
1105 TextBlock::TextBlock(TextPage
*pageA
, int rotA
) {
1111 priMax
= page
->pageWidth
;
1112 pool
= new TextPool();
1119 TextBlock::~TextBlock() {
1125 lines
= lines
->next
;
1130 void TextBlock::addWord(TextWord
*word
) {
1131 pool
->addWord(word
);
1138 if (word
->xMin
< xMin
) {
1141 if (word
->xMax
> xMax
) {
1144 if (word
->yMin
< yMin
) {
1147 if (word
->yMax
> yMax
) {
1153 void TextBlock::coalesce(UnicodeMap
*uMap
) {
1154 TextWord
*word0
, *word1
, *word2
, *bestWord0
, *bestWord1
, *lastWord
;
1155 TextLine
*line
, *line0
, *line1
;
1156 int poolMinBaseIdx
, startBaseIdx
, minBaseIdx
, maxBaseIdx
;
1157 int baseIdx
, bestWordBaseIdx
, idx0
, idx1
;
1158 double minBase
, maxBase
;
1159 double fontSize
, delta
, priDelta
, secDelta
;
1160 TextLine
**lineArray
;
1165 // discard duplicated text (fake boldface, drop shadows)
1166 for (idx0
= pool
->minBaseIdx
; idx0
<= pool
->maxBaseIdx
; ++idx0
) {
1167 word0
= pool
->getPool(idx0
);
1169 priDelta
= dupMaxPriDelta
* word0
->fontSize
;
1170 secDelta
= dupMaxSecDelta
* word0
->fontSize
;
1171 if (rot
== 0 || rot
== 3) {
1172 maxBaseIdx
= pool
->getBaseIdx(word0
->base
+ secDelta
);
1174 maxBaseIdx
= pool
->getBaseIdx(word0
->base
- secDelta
);
1177 word1
= word2
= NULL
; // make gcc happy
1178 for (idx1
= idx0
; idx1
<= maxBaseIdx
; ++idx1
) {
1181 word2
= word0
->next
;
1184 word2
= pool
->getPool(idx1
);
1186 for (; word2
; word1
= word2
, word2
= word2
->next
) {
1187 if (word2
->len
== word0
->len
&&
1188 !memcmp(word2
->text
, word0
->text
,
1189 word0
->len
* sizeof(Unicode
))) {
1193 found
= fabs(word0
->xMin
- word2
->xMin
) < priDelta
&&
1194 fabs(word0
->xMax
- word2
->xMax
) < priDelta
&&
1195 fabs(word0
->yMin
- word2
->yMin
) < secDelta
&&
1196 fabs(word0
->yMax
- word2
->yMax
) < secDelta
;
1200 found
= fabs(word0
->xMin
- word2
->xMin
) < secDelta
&&
1201 fabs(word0
->xMax
- word2
->xMax
) < secDelta
&&
1202 fabs(word0
->yMin
- word2
->yMin
) < priDelta
&&
1203 fabs(word0
->yMax
- word2
->yMax
) < priDelta
;
1217 word1
->next
= word2
->next
;
1219 pool
->setPool(idx1
, word2
->next
);
1223 word0
= word0
->next
;
1230 poolMinBaseIdx
= pool
->minBaseIdx
;
1235 // find the first non-empty line in the pool
1237 poolMinBaseIdx
<= pool
->maxBaseIdx
&& !pool
->getPool(poolMinBaseIdx
);
1239 if (poolMinBaseIdx
> pool
->maxBaseIdx
) {
1243 // look for the left-most word in the first four lines of the
1244 // pool -- this avoids starting with a superscript word
1245 startBaseIdx
= poolMinBaseIdx
;
1246 for (baseIdx
= poolMinBaseIdx
+ 1;
1247 baseIdx
< poolMinBaseIdx
+ 4 && baseIdx
<= pool
->maxBaseIdx
;
1249 if (!pool
->getPool(baseIdx
)) {
1252 if (pool
->getPool(baseIdx
)->primaryCmp(pool
->getPool(startBaseIdx
))
1254 startBaseIdx
= baseIdx
;
1258 // create a new line
1259 word0
= pool
->getPool(startBaseIdx
);
1260 pool
->setPool(startBaseIdx
, word0
->next
);
1262 line
= new TextLine(this, word0
->rot
, word0
->base
);
1263 line
->addWord(word0
);
1266 // compute the search range
1267 fontSize
= word0
->fontSize
;
1268 minBase
= word0
->base
- maxIntraLineDelta
* fontSize
;
1269 maxBase
= word0
->base
+ maxIntraLineDelta
* fontSize
;
1270 minBaseIdx
= pool
->getBaseIdx(minBase
);
1271 maxBaseIdx
= pool
->getBaseIdx(maxBase
);
1273 // find the rest of the words in this line
1276 // find the left-most word whose baseline is in the range for this line
1278 bestWordBaseIdx
= 0;
1279 bestWord0
= bestWord1
= NULL
;
1280 for (baseIdx
= minBaseIdx
; baseIdx
<= maxBaseIdx
; ++baseIdx
) {
1281 for (word0
= NULL
, word1
= pool
->getPool(baseIdx
);
1283 word0
= word1
, word1
= word1
->next
) {
1284 if (word1
->base
>= minBase
&&
1285 word1
->base
<= maxBase
&&
1286 (delta
= lastWord
->primaryDelta(word1
)) >=
1287 minCharSpacing
* fontSize
) {
1288 if (delta
< maxWordSpacing
* fontSize
&&
1289 (!bestWord1
|| word1
->primaryCmp(bestWord1
) < 0)) {
1290 bestWordBaseIdx
= baseIdx
;
1302 // remove it from the pool, and add it to the line
1304 bestWord0
->next
= bestWord1
->next
;
1306 pool
->setPool(bestWordBaseIdx
, bestWord1
->next
);
1308 bestWord1
->next
= NULL
;
1309 line
->addWord(bestWord1
);
1310 lastWord
= bestWord1
;
1314 if (curLine
&& line
->cmpYX(curLine
) > 0) {
1316 line1
= curLine
->next
;
1322 line1
&& line
->cmpYX(line1
) > 0;
1323 line0
= line1
, line1
= line1
->next
) ;
1331 line
->coalesce(uMap
);
1332 charCount
+= line
->len
;
1336 // sort lines into xy order for column assignment
1337 lineArray
= (TextLine
**)gmallocn(nLines
, sizeof(TextLine
*));
1338 for (line
= lines
, i
= 0; line
; line
= line
->next
, ++i
) {
1339 lineArray
[i
] = line
;
1341 qsort(lineArray
, nLines
, sizeof(TextLine
*), &TextLine::cmpXY
);
1343 // column assignment
1345 for (i
= 0; i
< nLines
; ++i
) {
1346 line0
= lineArray
[i
];
1348 for (j
= 0; j
< i
; ++j
) {
1349 line1
= lineArray
[j
];
1350 if (line1
->primaryDelta(line0
) >= 0) {
1351 col2
= line1
->col
[line1
->len
] + 1;
1353 k
= 0; // make gcc happy
1358 line0
->xMin
>= 0.5 * (line1
->edge
[k
] + line1
->edge
[k
+1]);
1364 line0
->yMin
>= 0.5 * (line1
->edge
[k
] + line1
->edge
[k
+1]);
1370 line0
->xMax
<= 0.5 * (line1
->edge
[k
] + line1
->edge
[k
+1]);
1376 line0
->yMax
<= 0.5 * (line1
->edge
[k
] + line1
->edge
[k
+1]);
1380 col2
= line1
->col
[k
];
1386 for (k
= 0; k
<= line0
->len
; ++k
) {
1387 line0
->col
[k
] += col1
;
1389 if (line0
->col
[line0
->len
] > nColumns
) {
1390 nColumns
= line0
->col
[line0
->len
];
1396 void TextBlock::updatePriMinMax(TextBlock
*blk
) {
1397 double newPriMin
, newPriMax
;
1398 GBool gotPriMin
, gotPriMax
;
1400 gotPriMin
= gotPriMax
= gFalse
;
1401 newPriMin
= newPriMax
= 0; // make gcc happy
1402 switch (page
->primaryRot
) {
1405 if (blk
->yMin
< yMax
&& blk
->yMax
> yMin
) {
1406 if (blk
->xMin
< xMin
) {
1407 newPriMin
= blk
->xMax
;
1410 if (blk
->xMax
> xMax
) {
1411 newPriMax
= blk
->xMin
;
1418 if (blk
->xMin
< xMax
&& blk
->xMax
> xMin
) {
1419 if (blk
->yMin
< yMin
) {
1420 newPriMin
= blk
->yMax
;
1423 if (blk
->yMax
> yMax
) {
1424 newPriMax
= blk
->yMin
;
1431 if (newPriMin
> xMin
) {
1434 if (newPriMin
> priMin
) {
1439 if (newPriMax
< xMax
) {
1442 if (newPriMax
< priMax
) {
1448 int TextBlock::cmpXYPrimaryRot(const void *p1
, const void *p2
) {
1449 TextBlock
*blk1
= *(TextBlock
**)p1
;
1450 TextBlock
*blk2
= *(TextBlock
**)p2
;
1453 cmp
= 0; // make gcc happy
1454 switch (blk1
->page
->primaryRot
) {
1456 if ((cmp
= blk1
->xMin
- blk2
->xMin
) == 0) {
1457 cmp
= blk1
->yMin
- blk2
->yMin
;
1461 if ((cmp
= blk1
->yMin
- blk2
->yMin
) == 0) {
1462 cmp
= blk2
->xMax
- blk1
->xMax
;
1466 if ((cmp
= blk2
->xMax
- blk1
->xMax
) == 0) {
1467 cmp
= blk2
->yMin
- blk1
->yMin
;
1471 if ((cmp
= blk2
->yMax
- blk1
->yMax
) == 0) {
1472 cmp
= blk1
->xMax
- blk2
->xMax
;
1476 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1479 int TextBlock::cmpYXPrimaryRot(const void *p1
, const void *p2
) {
1480 TextBlock
*blk1
= *(TextBlock
**)p1
;
1481 TextBlock
*blk2
= *(TextBlock
**)p2
;
1484 cmp
= 0; // make gcc happy
1485 switch (blk1
->page
->primaryRot
) {
1487 if ((cmp
= blk1
->yMin
- blk2
->yMin
) == 0) {
1488 cmp
= blk1
->xMin
- blk2
->xMin
;
1492 if ((cmp
= blk2
->xMax
- blk1
->xMax
) == 0) {
1493 cmp
= blk1
->yMin
- blk2
->yMin
;
1497 if ((cmp
= blk2
->yMin
- blk1
->yMin
) == 0) {
1498 cmp
= blk2
->xMax
- blk1
->xMax
;
1502 if ((cmp
= blk1
->xMax
- blk2
->xMax
) == 0) {
1503 cmp
= blk2
->yMax
- blk1
->yMax
;
1507 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1510 int TextBlock::primaryCmp(TextBlock
*blk
) {
1513 cmp
= 0; // make gcc happy
1516 cmp
= xMin
- blk
->xMin
;
1519 cmp
= yMin
- blk
->yMin
;
1522 cmp
= blk
->xMax
- xMax
;
1525 cmp
= blk
->yMax
- yMax
;
1528 return cmp
< 0 ? -1 : cmp
> 0 ? 1 : 0;
1531 double TextBlock::secondaryDelta(TextBlock
*blk
) {
1534 delta
= 0; // make gcc happy
1537 delta
= blk
->yMin
- yMax
;
1540 delta
= xMin
- blk
->xMax
;
1543 delta
= yMin
- blk
->yMax
;
1546 delta
= blk
->xMin
- xMax
;
1552 GBool
TextBlock::isBelow(TextBlock
*blk
) {
1555 below
= gFalse
; // make gcc happy
1556 switch (page
->primaryRot
) {
1558 below
= xMin
>= blk
->priMin
&& xMax
<= blk
->priMax
&&
1562 below
= yMin
>= blk
->priMin
&& yMax
<= blk
->priMax
&&
1566 below
= xMin
>= blk
->priMin
&& xMax
<= blk
->priMax
&&
1570 below
= yMin
>= blk
->priMin
&& yMax
<= blk
->priMax
&&
1578 //------------------------------------------------------------------------
1580 //------------------------------------------------------------------------
1582 TextFlow::TextFlow(TextPage
*pageA
, TextBlock
*blk
) {
1588 priMin
= blk
->priMin
;
1589 priMax
= blk
->priMax
;
1590 blocks
= lastBlk
= blk
;
1594 TextFlow::~TextFlow() {
1599 blocks
= blocks
->next
;
1604 void TextFlow::addBlock(TextBlock
*blk
) {
1606 lastBlk
->next
= blk
;
1611 if (blk
->xMin
< xMin
) {
1614 if (blk
->xMax
> xMax
) {
1617 if (blk
->yMin
< yMin
) {
1620 if (blk
->yMax
> yMax
) {
1625 GBool
TextFlow::blockFits(TextBlock
*blk
, TextBlock
*prevBlk
) {
1628 // lower blocks must use smaller fonts
1629 if (blk
->lines
->words
->fontSize
> lastBlk
->lines
->words
->fontSize
) {
1633 fits
= gFalse
; // make gcc happy
1634 switch (page
->primaryRot
) {
1636 fits
= blk
->xMin
>= priMin
&& blk
->xMax
<= priMax
;
1639 fits
= blk
->yMin
>= priMin
&& blk
->yMax
<= priMax
;
1642 fits
= blk
->xMin
>= priMin
&& blk
->xMax
<= priMax
;
1645 fits
= blk
->yMin
>= priMin
&& blk
->yMax
<= priMax
;
1651 #if TEXTOUT_WORD_LIST
1653 //------------------------------------------------------------------------
1655 //------------------------------------------------------------------------
1657 TextWordList::TextWordList(TextPage
*text
, GBool physLayout
) {
1662 TextWord
**wordArray
;
1665 words
= new GList();
1667 if (text
->rawOrder
) {
1668 for (word
= text
->rawWords
; word
; word
= word
->next
) {
1669 words
->append(word
);
1672 } else if (physLayout
) {
1673 // this is inefficient, but it's also the least useful of these
1676 for (flow
= text
->flows
; flow
; flow
= flow
->next
) {
1677 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
1678 for (line
= blk
->lines
; line
; line
= line
->next
) {
1679 for (word
= line
->words
; word
; word
= word
->next
) {
1685 wordArray
= (TextWord
**)gmallocn(nWords
, sizeof(TextWord
*));
1687 for (flow
= text
->flows
; flow
; flow
= flow
->next
) {
1688 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
1689 for (line
= blk
->lines
; line
; line
= line
->next
) {
1690 for (word
= line
->words
; word
; word
= word
->next
) {
1691 wordArray
[i
++] = word
;
1696 qsort(wordArray
, nWords
, sizeof(TextWord
*), &TextWord::cmpYX
);
1697 for (i
= 0; i
< nWords
; ++i
) {
1698 words
->append(wordArray
[i
]);
1703 for (flow
= text
->flows
; flow
; flow
= flow
->next
) {
1704 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
1705 for (line
= blk
->lines
; line
; line
= line
->next
) {
1706 for (word
= line
->words
; word
; word
= word
->next
) {
1707 words
->append(word
);
1715 TextWordList::~TextWordList() {
1719 int TextWordList::getLength() {
1720 return words
->getLength();
1723 TextWord
*TextWordList::get(int idx
) {
1724 if (idx
< 0 || idx
>= words
->getLength()) {
1727 return (TextWord
*)words
->get(idx
);
1730 #endif // TEXTOUT_WORD_LIST
1732 //------------------------------------------------------------------------
1734 //------------------------------------------------------------------------
1736 TextPage::TextPage(GBool rawOrderA
) {
1739 rawOrder
= rawOrderA
;
1746 lastCharOverlap
= gFalse
;
1748 for (rot
= 0; rot
< 4; ++rot
) {
1749 pools
[rot
] = new TextPool();
1756 fonts
= new GList();
1757 lastFindXMin
= lastFindYMin
= 0;
1758 haveLastFind
= gFalse
;
1759 underlines
= new GList();
1760 links
= new GList();
1763 TextPage::~TextPage() {
1768 for (rot
= 0; rot
< 4; ++rot
) {
1773 deleteGList(underlines
, TextUnderline
);
1774 deleteGList(links
, TextLink
);
1777 void TextPage::startPage(GfxState
*state
) {
1780 pageWidth
= state
->getPageWidth();
1781 pageHeight
= state
->getPageHeight();
1783 pageWidth
= pageHeight
= 0;
1787 void TextPage::endPage() {
1793 void TextPage::clear() {
1805 rawWords
= rawWords
->next
;
1809 for (rot
= 0; rot
< 4; ++rot
) {
1814 flows
= flows
->next
;
1819 deleteGList(fonts
, TextFontInfo
);
1828 for (rot
= 0; rot
< 4; ++rot
) {
1829 pools
[rot
] = new TextPool();
1836 fonts
= new GList();
1839 void TextPage::updateFont(GfxState
*state
) {
1843 int code
, mCode
, letterCode
, anyCode
;
1847 // get the font info xObject
1849 for (i
= 0; i
< fonts
->getLength(); ++i
) {
1850 curFont
= (TextFontInfo
*)fonts
->get(i
);
1851 if (curFont
->matches(state
)) {
1857 curFont
= new TextFontInfo(state
);
1858 fonts
->append(curFont
);
1861 // adjust the font size
1862 gfxFont
= state
->getFont();
1863 curFontSize
= state
->getTransformedFontSize();
1864 if (gfxFont
&& gfxFont
->getType() == fontType3
) {
1865 // This is a hack which makes it possible to deal with some Type 3
1866 // fonts. The problem is that it's impossible to know what the
1867 // base coordinate system used in the font is without actually
1868 // rendering the font. This code tries to guess by looking at the
1869 // width of the character 'm' (which breaks if the font is a
1870 // subset that doesn't contain 'm').
1871 mCode
= letterCode
= anyCode
= -1;
1872 for (code
= 0; code
< 256; ++code
) {
1873 name
= ((Gfx8BitFont
*)gfxFont
)->getCharName(code
);
1874 if (name
&& name
[0] == 'm' && name
[1] == '\0') {
1877 if (letterCode
< 0 && name
&& name
[1] == '\0' &&
1878 ((name
[0] >= 'A' && name
[0] <= 'Z') ||
1879 (name
[0] >= 'a' && name
[0] <= 'z'))) {
1882 if (anyCode
< 0 && name
&&
1883 ((Gfx8BitFont
*)gfxFont
)->getWidth(code
) > 0) {
1888 (w
= ((Gfx8BitFont
*)gfxFont
)->getWidth(mCode
)) > 0) {
1889 // 0.6 is a generic average 'm' width -- yes, this is a hack
1890 curFontSize
*= w
/ 0.6;
1891 } else if (letterCode
>= 0 &&
1892 (w
= ((Gfx8BitFont
*)gfxFont
)->getWidth(letterCode
)) > 0) {
1893 // even more of a hack: 0.5 is a generic letter width
1894 curFontSize
*= w
/ 0.5;
1895 } else if (anyCode
>= 0 &&
1896 (w
= ((Gfx8BitFont
*)gfxFont
)->getWidth(anyCode
)) > 0) {
1897 // better than nothing: 0.5 is a generic character width
1898 curFontSize
*= w
/ 0.5;
1900 fm
= gfxFont
->getFontMatrix();
1902 curFontSize
*= fabs(fm
[3] / fm
[0]);
1907 void TextPage::beginWord(GfxState
*state
, double x0
, double y0
) {
1912 // This check is needed because Type 3 characters can contain
1913 // text-drawing operations (when TextPage is being used via
1914 // {X,Win}SplashOutputDev rather than TextOutputDev).
1920 // compute the rotation
1921 state
->getFontTransMat(&m
[0], &m
[1], &m
[2], &m
[3]);
1922 if (state
->getFont()->getType() == fontType3
) {
1923 fontm
= state
->getFont()->getFontMatrix();
1924 m2
[0] = fontm
[0] * m
[0] + fontm
[1] * m
[2];
1925 m2
[1] = fontm
[0] * m
[1] + fontm
[1] * m
[3];
1926 m2
[2] = fontm
[2] * m
[0] + fontm
[3] * m
[2];
1927 m2
[3] = fontm
[2] * m
[1] + fontm
[3] * m
[3];
1933 if (fabs(m
[0] * m
[3]) > fabs(m
[1] * m
[2])) {
1934 rot
= (m
[3] < 0) ? 0 : 2;
1936 rot
= (m
[2] > 0) ? 1 : 3;
1939 curWord
= new TextWord(state
, rot
, x0
, y0
, charPos
, curFont
, curFontSize
);
1942 void TextPage::addChar(GfxState
*state
, double x
, double y
,
1943 double dx
, double dy
,
1944 CharCode c
, int nBytes
, Unicode
*u
, int uLen
) {
1945 double x1
, y1
, w1
, h1
, dx2
, dy2
, base
, sp
, delta
;
1949 // subtract char and word spacing from the dx,dy values
1950 sp
= state
->getCharSpace();
1951 if (c
== (CharCode
)0x20) {
1952 sp
+= state
->getWordSpace();
1954 state
->textTransformDelta(sp
* state
->getHorizScaling(), 0, &dx2
, &dy2
);
1957 state
->transformDelta(dx
, dy
, &w1
, &h1
);
1959 // throw away chars that aren't inside the page bounds
1960 // (and also do a sanity check on the character size)
1961 state
->transform(x
, y
, &x1
, &y1
);
1962 if (x1
+ w1
< 0 || x1
> pageWidth
||
1963 y1
+ h1
< 0 || y1
> pageHeight
||
1964 w1
> pageWidth
|| h1
> pageHeight
) {
1969 // check the tiny chars limit
1970 if (!globalParams
->getTextKeepTinyChars() &&
1971 fabs(w1
) < 3 && fabs(h1
) < 3) {
1972 if (++nTinyChars
> 50000) {
1978 // break words at space character
1979 if (uLen
== 1 && u
[0] == (Unicode
)0x20) {
1988 // start a new word if:
1989 // (1) this character doesn't fall in the right place relative to
1990 // the end of the previous word (this places upper and lower
1991 // constraints on the position deltas along both the primary
1992 // and secondary axes), or
1993 // (2) this character overlaps the previous one (duplicated text), or
1994 // (3) the previous character was an overlap (we want each duplicated
1995 // character to be in a word by itself at this stage),
1996 // (4) the font size has changed
1997 if (curWord
&& curWord
->len
> 0) {
1998 base
= sp
= delta
= 0; // make gcc happy
1999 switch (curWord
->rot
) {
2002 sp
= x1
- curWord
->xMax
;
2003 delta
= x1
- curWord
->edge
[curWord
->len
- 1];
2007 sp
= y1
- curWord
->yMax
;
2008 delta
= y1
- curWord
->edge
[curWord
->len
- 1];
2012 sp
= curWord
->xMin
- x1
;
2013 delta
= curWord
->edge
[curWord
->len
- 1] - x1
;
2017 sp
= curWord
->yMin
- y1
;
2018 delta
= curWord
->edge
[curWord
->len
- 1] - y1
;
2021 overlap
= fabs(delta
) < dupMaxPriDelta
* curWord
->fontSize
&&
2022 fabs(base
- curWord
->base
) < dupMaxSecDelta
* curWord
->fontSize
;
2023 if (overlap
|| lastCharOverlap
||
2024 sp
< -minDupBreakOverlap
* curWord
->fontSize
||
2025 sp
> minWordBreakSpace
* curWord
->fontSize
||
2026 fabs(base
- curWord
->base
) > 0.5 ||
2027 curFontSize
!= curWord
->fontSize
) {
2030 lastCharOverlap
= overlap
;
2032 lastCharOverlap
= gFalse
;
2036 // start a new word if needed
2038 beginWord(state
, x
, y
);
2041 // page rotation and/or transform matrices can cause text to be
2042 // drawn in reverse order -- in this case, swap the begin/end
2043 // coordinates and break text into individual chars
2044 if ((curWord
->rot
== 0 && w1
< 0) ||
2045 (curWord
->rot
== 1 && h1
< 0) ||
2046 (curWord
->rot
== 2 && w1
> 0) ||
2047 (curWord
->rot
== 3 && h1
> 0)) {
2049 beginWord(state
, x
+ dx
, y
+ dy
);
2056 // add the characters to the current word
2059 for (i
= 0; i
< uLen
; ++i
) {
2060 curWord
->addChar(state
, x1
+ i
*w1
, y1
+ i
*h1
, w1
, h1
, u
[i
]);
2064 curWord
->charLen
+= nBytes
;
2069 void TextPage::endWord() {
2070 // This check is needed because Type 3 characters can contain
2071 // text-drawing operations (when TextPage is being used via
2072 // {X,Win}SplashOutputDev rather than TextOutputDev).
2084 void TextPage::addWord(TextWord
*word
) {
2085 // throw away zero-length words -- they don't have valid xMin/xMax
2086 // values, and they're useless anyway
2087 if (word
->len
== 0) {
2094 rawLastWord
->next
= word
;
2100 pools
[word
->rot
]->addWord(word
);
2104 void TextPage::addUnderline(double x0
, double y0
, double x1
, double y1
) {
2105 underlines
->append(new TextUnderline(x0
, y0
, x1
, y1
));
2108 void TextPage::addLink(int xMin
, int yMin
, int xMax
, int yMax
, Link
*link
) {
2109 links
->append(new TextLink(xMin
, yMin
, xMax
, yMax
, link
));
2112 void TextPage::coalesce(GBool physLayout
, GBool doHTML
) {
2115 TextWord
*word0
, *word1
, *word2
;
2117 TextBlock
*blkList
, *blkStack
, *blk
, *lastBlk
, *blk0
, *blk1
;
2118 TextBlock
**blkArray
;
2119 TextFlow
*flow
, *lastFlow
;
2120 TextUnderline
*underline
;
2122 int rot
, poolMinBaseIdx
, baseIdx
, startBaseIdx
, endBaseIdx
;
2123 double minBase
, maxBase
, newMinBase
, newMaxBase
;
2124 double fontSize
, colSpace1
, colSpace2
, lineSpace
, intraLineSpace
, blkSpace
;
2128 int firstBlkIdx
, nBlocksLeft
;
2138 uMap
= globalParams
->getTextEncoding();
2144 #if 0 // for debugging
2145 printf("*** initial words ***\n");
2146 for (rot
= 0; rot
< 4; ++rot
) {
2148 for (baseIdx
= pool
->minBaseIdx
; baseIdx
<= pool
->maxBaseIdx
; ++baseIdx
) {
2149 for (word0
= pool
->getPool(baseIdx
); word0
; word0
= word0
->next
) {
2150 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2151 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
2152 word0
->base
, word0
->fontSize
, rot
*90, word0
->link
);
2153 for (i
= 0; i
< word0
->len
; ++i
) {
2154 fputc(word0
->text
[i
] & 0xff, stdout
);
2163 #if 0 //~ for debugging
2164 for (i
= 0; i
< underlines
->getLength(); ++i
) {
2165 underline
= (TextUnderline
*)underlines
->get(i
);
2166 printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2167 underline
->x0
, underline
->x1
, underline
->y0
, underline
->y1
,
2174 //----- handle underlining
2175 for (i
= 0; i
< underlines
->getLength(); ++i
) {
2176 underline
= (TextUnderline
*)underlines
->get(i
);
2177 if (underline
->horiz
) {
2179 if (pools
[0]->minBaseIdx
<= pools
[0]->maxBaseIdx
) {
2180 startBaseIdx
= pools
[0]->getBaseIdx(underline
->y0
+ minUnderlineGap
);
2181 endBaseIdx
= pools
[0]->getBaseIdx(underline
->y0
+ maxUnderlineGap
);
2182 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2183 for (word0
= pools
[0]->getPool(j
); word0
; word0
= word0
->next
) {
2184 //~ need to check the y value against the word baseline
2185 if (underline
->x0
< word0
->xMin
+ underlineSlack
&&
2186 word0
->xMax
- underlineSlack
< underline
->x1
) {
2187 word0
->underlined
= gTrue
;
2194 if (pools
[2]->minBaseIdx
<= pools
[2]->maxBaseIdx
) {
2195 startBaseIdx
= pools
[2]->getBaseIdx(underline
->y0
- maxUnderlineGap
);
2196 endBaseIdx
= pools
[2]->getBaseIdx(underline
->y0
- minUnderlineGap
);
2197 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2198 for (word0
= pools
[2]->getPool(j
); word0
; word0
= word0
->next
) {
2199 if (underline
->x0
< word0
->xMin
+ underlineSlack
&&
2200 word0
->xMax
- underlineSlack
< underline
->x1
) {
2201 word0
->underlined
= gTrue
;
2208 if (pools
[1]->minBaseIdx
<= pools
[1]->maxBaseIdx
) {
2209 startBaseIdx
= pools
[1]->getBaseIdx(underline
->x0
- maxUnderlineGap
);
2210 endBaseIdx
= pools
[1]->getBaseIdx(underline
->x0
- minUnderlineGap
);
2211 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2212 for (word0
= pools
[1]->getPool(j
); word0
; word0
= word0
->next
) {
2213 if (underline
->y0
< word0
->yMin
+ underlineSlack
&&
2214 word0
->yMax
- underlineSlack
< underline
->y1
) {
2215 word0
->underlined
= gTrue
;
2222 if (pools
[3]->minBaseIdx
<= pools
[3]->maxBaseIdx
) {
2223 startBaseIdx
= pools
[3]->getBaseIdx(underline
->x0
+ minUnderlineGap
);
2224 endBaseIdx
= pools
[3]->getBaseIdx(underline
->x0
+ maxUnderlineGap
);
2225 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2226 for (word0
= pools
[3]->getPool(j
); word0
; word0
= word0
->next
) {
2227 if (underline
->y0
< word0
->yMin
+ underlineSlack
&&
2228 word0
->yMax
- underlineSlack
< underline
->y1
) {
2229 word0
->underlined
= gTrue
;
2237 //----- handle links
2238 for (i
= 0; i
< links
->getLength(); ++i
) {
2239 link
= (TextLink
*)links
->get(i
);
2242 if (pools
[0]->minBaseIdx
<= pools
[0]->maxBaseIdx
) {
2243 startBaseIdx
= pools
[0]->getBaseIdx(link
->yMin
);
2244 endBaseIdx
= pools
[0]->getBaseIdx(link
->yMax
);
2245 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2246 for (word0
= pools
[0]->getPool(j
); word0
; word0
= word0
->next
) {
2247 if (link
->xMin
< word0
->xMin
+ hyperlinkSlack
&&
2248 word0
->xMax
- hyperlinkSlack
< link
->xMax
&&
2249 link
->yMin
< word0
->yMin
+ hyperlinkSlack
&&
2250 word0
->yMax
- hyperlinkSlack
< link
->yMax
) {
2251 word0
->link
= link
->link
;
2258 if (pools
[2]->minBaseIdx
<= pools
[2]->maxBaseIdx
) {
2259 startBaseIdx
= pools
[2]->getBaseIdx(link
->yMin
);
2260 endBaseIdx
= pools
[2]->getBaseIdx(link
->yMax
);
2261 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2262 for (word0
= pools
[2]->getPool(j
); word0
; word0
= word0
->next
) {
2263 if (link
->xMin
< word0
->xMin
+ hyperlinkSlack
&&
2264 word0
->xMax
- hyperlinkSlack
< link
->xMax
&&
2265 link
->yMin
< word0
->yMin
+ hyperlinkSlack
&&
2266 word0
->yMax
- hyperlinkSlack
< link
->yMax
) {
2267 word0
->link
= link
->link
;
2274 if (pools
[1]->minBaseIdx
<= pools
[1]->maxBaseIdx
) {
2275 startBaseIdx
= pools
[1]->getBaseIdx(link
->xMin
);
2276 endBaseIdx
= pools
[1]->getBaseIdx(link
->xMax
);
2277 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2278 for (word0
= pools
[1]->getPool(j
); word0
; word0
= word0
->next
) {
2279 if (link
->yMin
< word0
->yMin
+ hyperlinkSlack
&&
2280 word0
->yMax
- hyperlinkSlack
< link
->yMax
&&
2281 link
->xMin
< word0
->xMin
+ hyperlinkSlack
&&
2282 word0
->xMax
- hyperlinkSlack
< link
->xMax
) {
2283 word0
->link
= link
->link
;
2290 if (pools
[3]->minBaseIdx
<= pools
[3]->maxBaseIdx
) {
2291 startBaseIdx
= pools
[3]->getBaseIdx(link
->xMin
);
2292 endBaseIdx
= pools
[3]->getBaseIdx(link
->xMax
);
2293 for (j
= startBaseIdx
; j
<= endBaseIdx
; ++j
) {
2294 for (word0
= pools
[3]->getPool(j
); word0
; word0
= word0
->next
) {
2295 if (link
->yMin
< word0
->yMin
+ hyperlinkSlack
&&
2296 word0
->yMax
- hyperlinkSlack
< link
->yMax
&&
2297 link
->xMin
< word0
->xMin
+ hyperlinkSlack
&&
2298 word0
->xMax
- hyperlinkSlack
< link
->xMax
) {
2299 word0
->link
= link
->link
;
2307 //----- assemble the blocks
2309 //~ add an outer loop for writing mode (vertical text)
2311 // build blocks for each rotation value
2312 for (rot
= 0; rot
< 4; ++rot
) {
2314 poolMinBaseIdx
= pool
->minBaseIdx
;
2317 // add blocks until no more words are left
2320 // find the first non-empty line in the pool
2322 poolMinBaseIdx
<= pool
->maxBaseIdx
&&
2323 !pool
->getPool(poolMinBaseIdx
);
2325 if (poolMinBaseIdx
> pool
->maxBaseIdx
) {
2329 // look for the left-most word in the first four lines of the
2330 // pool -- this avoids starting with a superscript word
2331 startBaseIdx
= poolMinBaseIdx
;
2332 for (baseIdx
= poolMinBaseIdx
+ 1;
2333 baseIdx
< poolMinBaseIdx
+ 4 && baseIdx
<= pool
->maxBaseIdx
;
2335 if (!pool
->getPool(baseIdx
)) {
2338 if (pool
->getPool(baseIdx
)->primaryCmp(pool
->getPool(startBaseIdx
))
2340 startBaseIdx
= baseIdx
;
2344 // create a new block
2345 word0
= pool
->getPool(startBaseIdx
);
2346 pool
->setPool(startBaseIdx
, word0
->next
);
2348 blk
= new TextBlock(this, rot
);
2349 blk
->addWord(word0
);
2351 fontSize
= word0
->fontSize
;
2352 minBase
= maxBase
= word0
->base
;
2353 colSpace1
= minColSpacing1
* fontSize
;
2354 colSpace2
= minColSpacing2
* fontSize
;
2355 lineSpace
= maxLineSpacingDelta
* fontSize
;
2356 intraLineSpace
= maxIntraLineDelta
* fontSize
;
2358 // add words to the block
2362 // look for words on the line above the current top edge of
2364 newMinBase
= minBase
;
2365 for (baseIdx
= pool
->getBaseIdx(minBase
);
2366 baseIdx
>= pool
->getBaseIdx(minBase
- lineSpace
);
2369 word1
= pool
->getPool(baseIdx
);
2371 if (word1
->base
< minBase
&&
2372 word1
->base
>= minBase
- lineSpace
&&
2373 ((rot
== 0 || rot
== 2)
2374 ? (word1
->xMin
< blk
->xMax
&& word1
->xMax
> blk
->xMin
)
2375 : (word1
->yMin
< blk
->yMax
&& word1
->yMax
> blk
->yMin
)) &&
2376 fabs(word1
->fontSize
- fontSize
) <
2377 maxBlockFontSizeDelta1
* fontSize
) {
2380 word0
->next
= word1
->next
;
2382 pool
->setPool(baseIdx
, word1
->next
);
2384 word1
= word1
->next
;
2386 blk
->addWord(word2
);
2388 newMinBase
= word2
->base
;
2391 word1
= word1
->next
;
2395 minBase
= newMinBase
;
2397 // look for words on the line below the current bottom edge of
2399 newMaxBase
= maxBase
;
2400 for (baseIdx
= pool
->getBaseIdx(maxBase
);
2401 baseIdx
<= pool
->getBaseIdx(maxBase
+ lineSpace
);
2404 word1
= pool
->getPool(baseIdx
);
2406 if (word1
->base
> maxBase
&&
2407 word1
->base
<= maxBase
+ lineSpace
&&
2408 ((rot
== 0 || rot
== 2)
2409 ? (word1
->xMin
< blk
->xMax
&& word1
->xMax
> blk
->xMin
)
2410 : (word1
->yMin
< blk
->yMax
&& word1
->yMax
> blk
->yMin
)) &&
2411 fabs(word1
->fontSize
- fontSize
) <
2412 maxBlockFontSizeDelta1
* fontSize
) {
2415 word0
->next
= word1
->next
;
2417 pool
->setPool(baseIdx
, word1
->next
);
2419 word1
= word1
->next
;
2421 blk
->addWord(word2
);
2423 newMaxBase
= word2
->base
;
2426 word1
= word1
->next
;
2430 maxBase
= newMaxBase
;
2432 // look for words that are on lines already in the block, and
2433 // that overlap the block horizontally
2434 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
2435 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
2438 word1
= pool
->getPool(baseIdx
);
2440 if (word1
->base
>= minBase
- intraLineSpace
&&
2441 word1
->base
<= maxBase
+ intraLineSpace
&&
2442 ((rot
== 0 || rot
== 2)
2443 ? (word1
->xMin
< blk
->xMax
+ colSpace1
&&
2444 word1
->xMax
> blk
->xMin
- colSpace1
)
2445 : (word1
->yMin
< blk
->yMax
+ colSpace1
&&
2446 word1
->yMax
> blk
->yMin
- colSpace1
)) &&
2447 fabs(word1
->fontSize
- fontSize
) <
2448 maxBlockFontSizeDelta2
* fontSize
) {
2451 word0
->next
= word1
->next
;
2453 pool
->setPool(baseIdx
, word1
->next
);
2455 word1
= word1
->next
;
2457 blk
->addWord(word2
);
2461 word1
= word1
->next
;
2466 // only check for outlying words (the next two chunks of code)
2467 // if we didn't find anything else
2472 // scan down the left side of the block, looking for words
2473 // that are near (but not overlapping) the block; if there are
2474 // three or fewer, add them to the block
2476 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
2477 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
2479 word1
= pool
->getPool(baseIdx
);
2481 if (word1
->base
>= minBase
- intraLineSpace
&&
2482 word1
->base
<= maxBase
+ intraLineSpace
&&
2483 ((rot
== 0 || rot
== 2)
2484 ? (word1
->xMax
<= blk
->xMin
&&
2485 word1
->xMax
> blk
->xMin
- colSpace2
)
2486 : (word1
->yMax
<= blk
->yMin
&&
2487 word1
->yMax
> blk
->yMin
- colSpace2
)) &&
2488 fabs(word1
->fontSize
- fontSize
) <
2489 maxBlockFontSizeDelta3
* fontSize
) {
2493 word1
= word1
->next
;
2496 if (n
> 0 && n
<= 3) {
2497 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
2498 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
2501 word1
= pool
->getPool(baseIdx
);
2503 if (word1
->base
>= minBase
- intraLineSpace
&&
2504 word1
->base
<= maxBase
+ intraLineSpace
&&
2505 ((rot
== 0 || rot
== 2)
2506 ? (word1
->xMax
<= blk
->xMin
&&
2507 word1
->xMax
> blk
->xMin
- colSpace2
)
2508 : (word1
->yMax
<= blk
->yMin
&&
2509 word1
->yMax
> blk
->yMin
- colSpace2
)) &&
2510 fabs(word1
->fontSize
- fontSize
) <
2511 maxBlockFontSizeDelta3
* fontSize
) {
2514 word0
->next
= word1
->next
;
2516 pool
->setPool(baseIdx
, word1
->next
);
2518 word1
= word1
->next
;
2520 blk
->addWord(word2
);
2521 if (word2
->base
< minBase
) {
2522 minBase
= word2
->base
;
2523 } else if (word2
->base
> maxBase
) {
2524 maxBase
= word2
->base
;
2530 word1
= word1
->next
;
2536 // scan down the right side of the block, looking for words
2537 // that are near (but not overlapping) the block; if there are
2538 // three or fewer, add them to the block
2540 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
2541 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
2543 word1
= pool
->getPool(baseIdx
);
2545 if (word1
->base
>= minBase
- intraLineSpace
&&
2546 word1
->base
<= maxBase
+ intraLineSpace
&&
2547 ((rot
== 0 || rot
== 2)
2548 ? (word1
->xMin
>= blk
->xMax
&&
2549 word1
->xMin
< blk
->xMax
+ colSpace2
)
2550 : (word1
->yMin
>= blk
->yMax
&&
2551 word1
->yMin
< blk
->yMax
+ colSpace2
)) &&
2552 fabs(word1
->fontSize
- fontSize
) <
2553 maxBlockFontSizeDelta3
* fontSize
) {
2557 word1
= word1
->next
;
2560 if (n
> 0 && n
<= 3) {
2561 for (baseIdx
= pool
->getBaseIdx(minBase
- intraLineSpace
);
2562 baseIdx
<= pool
->getBaseIdx(maxBase
+ intraLineSpace
);
2565 word1
= pool
->getPool(baseIdx
);
2567 if (word1
->base
>= minBase
- intraLineSpace
&&
2568 word1
->base
<= maxBase
+ intraLineSpace
&&
2569 ((rot
== 0 || rot
== 2)
2570 ? (word1
->xMin
>= blk
->xMax
&&
2571 word1
->xMin
< blk
->xMax
+ colSpace2
)
2572 : (word1
->yMin
>= blk
->yMax
&&
2573 word1
->yMin
< blk
->yMax
+ colSpace2
)) &&
2574 fabs(word1
->fontSize
- fontSize
) <
2575 maxBlockFontSizeDelta3
* fontSize
) {
2578 word0
->next
= word1
->next
;
2580 pool
->setPool(baseIdx
, word1
->next
);
2582 word1
= word1
->next
;
2584 blk
->addWord(word2
);
2585 if (word2
->base
< minBase
) {
2586 minBase
= word2
->base
;
2587 } else if (word2
->base
> maxBase
) {
2588 maxBase
= word2
->base
;
2594 word1
= word1
->next
;
2602 //~ need to compute the primary writing mode (horiz/vert) in
2603 //~ addition to primary rotation
2605 // coalesce the block, and add it to the list
2606 blk
->coalesce(uMap
);
2608 lastBlk
->next
= blk
;
2613 count
[rot
] += blk
->charCount
;
2614 if (primaryRot
< 0 || count
[rot
] > count
[primaryRot
]) {
2621 #if 0 // for debugging
2622 printf("*** rotation ***\n");
2623 for (rot
= 0; rot
< 4; ++rot
) {
2624 printf(" %d: %6d\n", rot
, count
[rot
]);
2626 printf(" primary rot = %d\n", primaryRot
);
2630 #if 0 // for debugging
2631 printf("*** blocks ***\n");
2632 for (blk
= blkList
; blk
; blk
= blk
->next
) {
2633 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2634 blk
->rot
, blk
->xMin
, blk
->xMax
, blk
->yMin
, blk
->yMax
);
2635 for (line
= blk
->lines
; line
; line
= line
->next
) {
2636 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2637 line
->xMin
, line
->xMax
, line
->yMin
, line
->yMax
, line
->base
);
2638 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
2639 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2640 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
2641 word0
->base
, word0
->fontSize
, word0
->spaceAfter
);
2642 for (i
= 0; i
< word0
->len
; ++i
) {
2643 fputc(word0
->text
[i
] & 0xff, stdout
);
2652 // determine the primary direction
2654 for (blk
= blkList
; blk
; blk
= blk
->next
) {
2655 for (line
= blk
->lines
; line
; line
= line
->next
) {
2656 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
2657 for (i
= 0; i
< word0
->len
; ++i
) {
2658 if (unicodeTypeL(word0
->text
[i
])) {
2660 } else if (unicodeTypeR(word0
->text
[i
])) {
2667 primaryLR
= lrCount
>= 0;
2669 #if 0 // for debugging
2670 printf("*** direction ***\n");
2671 printf("lrCount = %d\n", lrCount
);
2672 printf("primaryLR = %d\n", primaryLR
);
2675 //----- column assignment
2677 // sort blocks into xy order for column assignment
2678 blocks
= (TextBlock
**)gmallocn(nBlocks
, sizeof(TextBlock
*));
2679 for (blk
= blkList
, i
= 0; blk
; blk
= blk
->next
, ++i
) {
2682 qsort(blocks
, nBlocks
, sizeof(TextBlock
*), &TextBlock::cmpXYPrimaryRot
);
2684 // column assignment
2685 for (i
= 0; i
< nBlocks
; ++i
) {
2688 for (j
= 0; j
< i
; ++j
) {
2690 col2
= 0; // make gcc happy
2691 switch (primaryRot
) {
2693 if (blk0
->xMin
> blk1
->xMax
) {
2694 col2
= blk1
->col
+ blk1
->nColumns
+ 3;
2695 } else if (blk1
->xMax
== blk1
->xMin
) {
2698 col2
= blk1
->col
+ (int)(((blk0
->xMin
- blk1
->xMin
) /
2699 (blk1
->xMax
- blk1
->xMin
)) *
2704 if (blk0
->yMin
> blk1
->yMax
) {
2705 col2
= blk1
->col
+ blk1
->nColumns
+ 3;
2706 } else if (blk1
->yMax
== blk1
->yMin
) {
2709 col2
= blk1
->col
+ (int)(((blk0
->yMin
- blk1
->yMin
) /
2710 (blk1
->yMax
- blk1
->yMin
)) *
2715 if (blk0
->xMax
< blk1
->xMin
) {
2716 col2
= blk1
->col
+ blk1
->nColumns
+ 3;
2717 } else if (blk1
->xMin
== blk1
->xMax
) {
2720 col2
= blk1
->col
+ (int)(((blk0
->xMax
- blk1
->xMax
) /
2721 (blk1
->xMin
- blk1
->xMax
)) *
2726 if (blk0
->yMax
< blk1
->yMin
) {
2727 col2
= blk1
->col
+ blk1
->nColumns
+ 3;
2728 } else if (blk1
->yMin
== blk1
->yMax
) {
2731 col2
= blk1
->col
+ (int)(((blk0
->yMax
- blk1
->yMax
) /
2732 (blk1
->yMin
- blk1
->yMax
)) *
2742 for (line
= blk0
->lines
; line
; line
= line
->next
) {
2743 for (j
= 0; j
<= line
->len
; ++j
) {
2744 line
->col
[j
] += col1
;
2749 #if 0 // for debugging
2750 printf("*** blocks, after column assignment ***\n");
2751 for (blk
= blkList
; blk
; blk
= blk
->next
) {
2752 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2753 blk
->rot
, blk
->xMin
, blk
->xMax
, blk
->yMin
, blk
->yMax
, blk
->col
,
2755 for (line
= blk
->lines
; line
; line
= line
->next
) {
2757 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
2758 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2759 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
2760 word0
->base
, word0
->fontSize
, word0
->spaceAfter
);
2761 for (i
= 0; i
< word0
->len
; ++i
) {
2762 fputc(word0
->text
[i
] & 0xff, stdout
);
2771 //----- reading order sort
2773 // sort blocks into yx order (in preparation for reading order sort)
2774 qsort(blocks
, nBlocks
, sizeof(TextBlock
*), &TextBlock::cmpYXPrimaryRot
);
2776 // compute space on left and right sides of each block
2777 for (i
= 0; i
< nBlocks
; ++i
) {
2779 for (j
= 0; j
< nBlocks
; ++j
) {
2782 blk0
->updatePriMinMax(blk1
);
2787 #if 0 // for debugging
2788 printf("*** blocks, after yx sort ***\n");
2789 for (i
= 0; i
< nBlocks
; ++i
) {
2791 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
2792 blk
->rot
, blk
->xMin
, blk
->xMax
, blk
->yMin
, blk
->yMax
,
2793 blk
->priMin
, blk
->priMax
);
2794 for (line
= blk
->lines
; line
; line
= line
->next
) {
2796 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
2797 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2798 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
2799 word0
->base
, word0
->fontSize
, word0
->spaceAfter
);
2800 for (j
= 0; j
< word0
->len
; ++j
) {
2801 fputc(word0
->text
[j
] & 0xff, stdout
);
2811 //~ this needs to be adjusted for writing mode (vertical text)
2812 //~ this also needs to account for right-to-left column ordering
2813 blkArray
= (TextBlock
**)gmallocn(nBlocks
, sizeof(TextBlock
*));
2814 memcpy(blkArray
, blocks
, nBlocks
* sizeof(TextBlock
*));
2815 flows
= lastFlow
= NULL
;
2817 nBlocksLeft
= nBlocks
;
2818 while (nBlocksLeft
> 0) {
2820 // find the upper-left-most block
2821 for (; !blkArray
[firstBlkIdx
]; ++firstBlkIdx
) ;
2824 for (j
= firstBlkIdx
+ 1; j
< nBlocks
; ++j
) {
2827 if (blk
&& blk
->secondaryDelta(blk1
) > 0) {
2830 if (blk1
->primaryCmp(blk
) < 0) {
2840 // create a new flow, starting with the upper-left-most block
2841 flow
= new TextFlow(this, blk
);
2843 lastFlow
->next
= flow
;
2848 fontSize
= blk
->lines
->words
->fontSize
;
2850 // push the upper-left-most block on the stack
2851 blk
->stackNext
= NULL
;
2854 // find the other blocks in this flow
2857 // find the upper-left-most block under (but within
2858 // maxBlockSpacing of) the top block on the stack
2859 blkSpace
= maxBlockSpacing
* blkStack
->lines
->words
->fontSize
;
2862 for (j
= firstBlkIdx
; j
< nBlocks
; ++j
) {
2865 if (blkStack
->secondaryDelta(blk1
) > blkSpace
) {
2868 if (blk
&& blk
->secondaryDelta(blk1
) > 0) {
2871 if (blk1
->isBelow(blkStack
) &&
2872 (!blk
|| blk1
->primaryCmp(blk
) < 0)) {
2879 // if a suitable block was found, add it to the flow and push it
2881 if (blk
&& flow
->blockFits(blk
, blkStack
)) {
2885 flow
->addBlock(blk
);
2886 fontSize
= blk
->lines
->words
->fontSize
;
2887 blk
->stackNext
= blkStack
;
2890 // otherwise (if there is no block under the top block or the
2891 // block is not suitable), pop the stack
2893 blkStack
= blkStack
->stackNext
;
2899 #if 0 // for debugging
2900 printf("*** flows ***\n");
2901 for (flow
= flows
; flow
; flow
= flow
->next
) {
2902 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
2903 flow
->xMin
, flow
->xMax
, flow
->yMin
, flow
->yMax
,
2904 flow
->priMin
, flow
->priMax
);
2905 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
2906 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
2907 blk
->rot
, blk
->xMin
, blk
->xMax
, blk
->yMin
, blk
->yMax
,
2908 blk
->priMin
, blk
->priMax
);
2909 for (line
= blk
->lines
; line
; line
= line
->next
) {
2911 for (word0
= line
->words
; word0
; word0
= word0
->next
) {
2912 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2913 word0
->xMin
, word0
->xMax
, word0
->yMin
, word0
->yMax
,
2914 word0
->base
, word0
->fontSize
, word0
->spaceAfter
);
2915 for (i
= 0; i
< word0
->len
; ++i
) {
2916 fputc(word0
->text
[i
] & 0xff, stdout
);
// TextPage::findText -- search the page's blocks and lines for the
// Unicode string s (len entries).  When caseSensitive is false, both the
// search string and each examined line are passed through
// unicodeToUpper before comparison.  backward reverses the block and
// position iteration order.  startAtTop / stopAtBottom / startAtLast /
// stopAtLast select the search limits; the "last" variants use
// lastFindXMin / lastFindYMin when haveLastFind is set.  xMin, yMin,
// xMax, yMax are pointer parameters; the statements that read or write
// through them are not visible in this listing.
// NOTE(review): the embedded line numbers have gaps, so some original
// statements (including the return statements) are not shown here.
2931 GBool
TextPage::findText(Unicode
*s
, int len
,
2932 GBool startAtTop
, GBool stopAtBottom
,
2933 GBool startAtLast
, GBool stopAtLast
,
2934 GBool caseSensitive
, GBool backward
,
2935 double *xMin
, double *yMin
,
2936 double *xMax
, double *yMax
) {
// xStart..yStop are the search limits; the *1 variables hold the box of
// the candidate match and the *0 variables the match kept so far (see
// the comparisons against xMin0/yMin0 further down).
2941 int txtSize
, m
, i
, j
, k
;
2942 double xStart
, yStart
, xStop
, yStop
;
2943 double xMin0
, yMin0
, xMax0
, yMax0
;
2944 double xMin1
, yMin1
, xMax1
, yMax1
;
2947 //~ needs to handle right-to-left text
2953 // convert the search string to uppercase
2954 if (!caseSensitive
) {
2955 s2
= (Unicode
*)gmallocn(len
, sizeof(Unicode
));
2956 for (i
= 0; i
< len
; ++i
) {
2957 s2
[i
] = unicodeToUpper(s
[i
]);
// limits default to 0 and are taken from the previous match position
// when startAtLast / stopAtLast is requested and a previous match exists
2966 xStart
= yStart
= xStop
= yStop
= 0;
2967 if (startAtLast
&& haveLastFind
) {
2968 xStart
= lastFindXMin
;
2969 yStart
= lastFindYMin
;
2970 } else if (!startAtTop
) {
2974 if (stopAtLast
&& haveLastFind
) {
2975 xStop
= lastFindXMin
;
2976 yStop
= lastFindYMin
;
2977 } else if (!stopAtBottom
) {
2983 xMin0
= xMax0
= yMin0
= yMax0
= 0; // make gcc happy
2984 xMin1
= xMax1
= yMin1
= yMax1
= 0; // make gcc happy
// iterate over the blocks: from nBlocks - 1 down to 0 when searching
// backward, otherwise from 0 up to nBlocks - 1
2986 for (i
= backward
? nBlocks
- 1 : 0;
2987 backward
? i
>= 0 : i
< nBlocks
;
2988 i
+= backward
? -1 : 1) {
2991 // check: is the block above the top limit?
2992 if (!startAtTop
&& (backward
? blk
->yMin
> yStart
: blk
->yMax
< yStart
)) {
2996 // check: is the block below the bottom limit?
2997 if (!stopAtBottom
&& (backward
? blk
->yMax
< yStop
: blk
->yMin
> yStop
)) {
3001 for (line
= blk
->lines
; line
; line
= line
->next
) {
3003 // check: is the line above the top limit?
3005 (backward
? line
->yMin
> yStart
: line
->yMin
< yStart
)) {
3009 // check: is the line below the bottom limit?
3010 if (!stopAtBottom
&&
3011 (backward
? line
->yMin
< yStop
: line
->yMin
> yStop
)) {
3015 // convert the line to uppercase
3017 if (!caseSensitive
) {
3019 txt
= (Unicode
*)greallocn(txt
, m
, sizeof(Unicode
));
3022 for (k
= 0; k
< m
; ++k
) {
3023 txt
[k
] = unicodeToUpper(line
->text
[k
]);
3029 // search each position in this line
3030 j
= backward
? m
- len
: 0;
3032 while (backward
? j
>= 0 : j
<= m
- len
) {
3034 // compare the strings
3035 for (k
= 0; k
< len
; ++k
) {
3036 if (p
[k
] != s2
[k
]) {
// candidate box along the text direction: taken from line->edge[j] and
// line->edge[j + len]; which coordinate (x or y) and which end is the
// minimum depends on line->rot (the case labels are not visible here)
3043 switch (line
->rot
) {
3045 xMin1
= line
->edge
[j
];
3046 xMax1
= line
->edge
[j
+ len
];
3053 yMin1
= line
->edge
[j
];
3054 yMax1
= line
->edge
[j
+ len
];
3057 xMin1
= line
->edge
[j
+ len
];
3058 xMax1
= line
->edge
[j
];
3065 yMin1
= line
->edge
[j
+ len
];
3066 yMax1
= line
->edge
[j
];
// NOTE(review): presumably the backward-search case -- the candidate is
// compared against the start limit (y first, then x), against the stop
// limit, and finally against the match kept so far (later one wins).
3071 yMin1
< yStart
|| (yMin1
== yStart
&& xMin1
< xStart
)) &&
3073 yMin1
> yStop
|| (yMin1
== yStop
&& xMin1
> xStop
))) {
3075 yMin1
> yMin0
|| (yMin1
== yMin0
&& xMin1
> xMin0
)) {
// NOTE(review): presumably the forward-search case -- same tests with
// the comparisons reversed (earlier candidate wins).
3085 yMin1
> yStart
|| (yMin1
== yStart
&& xMin1
> xStart
)) &&
3087 yMin1
< yStop
|| (yMin1
== yStop
&& xMin1
< xStop
))) {
3089 yMin1
< yMin0
|| (yMin1
== yMin0
&& xMin1
< xMin0
)) {
3110 if (!caseSensitive
) {
// record the position of the kept match; it is read back at the top of
// this function when startAtLast / stopAtLast is set
3120 lastFindXMin
= xMin0
;
3121 lastFindYMin
= yMin0
;
3122 haveLastFind
= gTrue
;
// TextPage::getText -- build the text found inside the rectangle
// (xMin, yMin)-(xMax, yMax).  The visible code obtains the output
// encoding from globalParams, pre-maps the space and end-of-line
// sequences, collects TextLineFrag entries for lines overlapping the
// rectangle, sorts them with qsort, and appends the mapped text to s.
// Declared to return GString *; the return statement is not visible.
// NOTE(review): the embedded line numbers have gaps, so some original
// statements are not shown in this listing.
3129 GString
*TextPage::getText(double xMin
, double yMin
,
3130 double xMax
, double yMax
) {
3133 GBool isUnicode __unused
;
3136 TextLineFrag
*frags
;
3137 int nFrags
, fragsSize
;
3139 char space
[8], eol
[16];
3140 int spaceLen
, eolLen
;
3143 int col
, idx0
, idx1
, i
, j
;
3144 GBool multiLine
, oneRot
;
3152 // get the output encoding
3153 if (!(uMap
= globalParams
->getTextEncoding())) {
3156 isUnicode
= uMap
->isUnicode();
// encode U+0020 once; the EOL bytes depend on globalParams->getTextEOL():
// the visible variants are 0x0a alone, 0x0d followed by 0x0a, and 0x0d
// alone (the case labels themselves are not shown)
3157 spaceLen
= uMap
->mapUnicode(0x20, space
, sizeof(space
));
3158 eolLen
= 0; // make gcc happy
3159 switch (globalParams
->getTextEOL()) {
3161 eolLen
= uMap
->mapUnicode(0x0a, eol
, sizeof(eol
));
3164 eolLen
= uMap
->mapUnicode(0x0d, eol
, sizeof(eol
));
3165 eolLen
+= uMap
->mapUnicode(0x0a, eol
+ eolLen
, sizeof(eol
) - eolLen
);
3168 eolLen
= uMap
->mapUnicode(0x0d, eol
, sizeof(eol
));
3172 //~ writing mode (horiz/vert)
3174 // collect the line fragments that are in the rectangle
3176 frags
= (TextLineFrag
*)gmallocn(fragsSize
, sizeof(TextLineFrag
));
// only blocks, and within them only lines, whose bounding boxes overlap
// the rectangle are examined
3180 for (i
= 0; i
< nBlocks
; ++i
) {
3182 if (xMin
< blk
->xMax
&& blk
->xMin
< xMax
&&
3183 yMin
< blk
->yMax
&& blk
->yMin
< yMax
) {
3184 for (line
= blk
->lines
; line
; line
= line
->next
) {
3185 if (xMin
< line
->xMax
&& line
->xMin
< xMax
&&
3186 yMin
< line
->yMax
&& line
->yMin
< yMax
) {
// Four rotation cases follow.  Each accepts the line only if its
// centre coordinate across the text direction lies strictly inside the
// rectangle, then walks the characters comparing the midpoint of
// edge[j] and edge[j+1] against the rectangle bounds.
// NOTE(review): idx0 / idx1 are presumably set by these scans; the
// assignments are not visible here.
3188 switch (line
->rot
) {
3190 y
= 0.5 * (line
->yMin
+ line
->yMax
);
3191 if (yMin
< y
&& y
< yMax
) {
3193 while (j
< line
->len
) {
3194 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) > xMin
) {
3202 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) < xMax
) {
3211 x
= 0.5 * (line
->xMin
+ line
->xMax
);
3212 if (xMin
< x
&& x
< xMax
) {
3214 while (j
< line
->len
) {
3215 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) > yMin
) {
3223 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) < yMax
) {
3232 y
= 0.5 * (line
->yMin
+ line
->yMax
);
3233 if (yMin
< y
&& y
< yMax
) {
3235 while (j
< line
->len
) {
3236 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) < xMax
) {
3244 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) > xMin
) {
3253 x
= 0.5 * (line
->xMin
+ line
->xMax
);
3254 if (xMin
< x
&& x
< xMax
) {
3256 while (j
< line
->len
) {
3257 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) < yMax
) {
3265 if (0.5 * (line
->edge
[j
] + line
->edge
[j
+1]) > yMin
) {
// both indices valid: store a fragment of idx1 - idx0 + 1 characters
// starting at idx0, reallocating frags first when it is full
3274 if (idx0
>= 0 && idx1
>= 0) {
3275 if (nFrags
== fragsSize
) {
3277 frags
= (TextLineFrag
*)
3278 greallocn(frags
, fragsSize
, sizeof(TextLineFrag
));
3280 frags
[nFrags
].init(line
, idx0
, idx1
- idx0
+ 1);
// compare this line's rotation with the previous fragment's rotation
// NOTE(review): presumably this is where oneRot is cleared -- confirm
3282 if (lastRot
>= 0 && line
->rot
!= lastRot
) {
3285 lastRot
= line
->rot
;
3292 // sort the fragments and generate the string
3295 for (i
= 0; i
< nFrags
; ++i
) {
3296 frags
[i
].computeCoords(oneRot
);
3298 assignColumns(frags
, nFrags
, oneRot
);
3300 // if all lines in the region have the same rotation, use it;
3301 // otherwise, use the page's primary rotation
3303 qsort(frags
, nFrags
, sizeof(TextLineFrag
),
3304 &TextLineFrag::cmpYXLineRot
);
3306 qsort(frags
, nFrags
, sizeof(TextLineFrag
),
3307 &TextLineFrag::cmpYXPrimaryRot
);
// re-sort each run of fragments whose baselines are within
// maxIntraLineDelta * fontSize of frags[i].base, using a column-based
// comparison function
3310 while (i
< nFrags
) {
3311 delta
= maxIntraLineDelta
* frags
[i
].line
->words
->fontSize
;
3313 j
< nFrags
&& fabs(frags
[j
].base
- frags
[i
].base
) < delta
;
3315 qsort(frags
+ i
, j
- i
, sizeof(TextLineFrag
),
3316 oneRot
? &TextLineFrag::cmpXYColumnLineRot
3317 : &TextLineFrag::cmpXYColumnPrimaryRot
);
// emit the fragments: an EOL is appended when the fragment's column is
// left of the current column or its baseline differs from the previous
// fragment's by more than maxIntraLineDelta * fontSize; spaces pad up
// to the fragment's column
3323 for (i
= 0; i
< nFrags
; ++i
) {
3327 if (frag
->col
< col
||
3328 (i
> 0 && fabs(frag
->base
- frags
[i
-1].base
) >
3329 maxIntraLineDelta
* frags
[i
-1].line
->words
->fontSize
)) {
3330 s
->append(eol
, eolLen
);
3336 for (; col
< frag
->col
; ++col
) {
3337 s
->append(space
, spaceLen
);
3340 // get the fragment text
3341 col
+= dumpFragment(frag
->line
->text
+ frag
->start
, frag
->len
, uMap
, s
);
3345 s
->append(eol
, eolLen
);
// TextPage::findCharRange -- scan every word on the page and, for each
// word whose character positions [charPos, charPos + charLen) overlap
// the requested range [pos, pos + length), compute the box of the
// overlapping characters from word->edge and merge it into an
// accumulated box.  Declared to return GBool; the statements that
// write *xMin..*yMax and the return statement are not visible here.
// NOTE(review): the embedded line numbers have gaps, so some original
// statements are not shown in this listing.
3355 GBool
TextPage::findCharRange(int pos
, int length
,
3356 double *xMin
, double *yMin
,
3357 double *xMax
, double *yMax
) {
// *0: accumulated box over all matching words; *1: box for the current word
3361 double xMin0
, xMax0
, yMin0
, yMax0
;
3362 double xMin1
, xMax1
, yMin1
, yMax1
;
3370 //~ this doesn't correctly handle:
3371 //~ - ranges split across multiple lines (the highlighted region
3372 //~ is the bounding box of all the parts of the range)
3373 //~ - cases where characters don't convert one-to-one into Unicode
3375 xMin0
= xMax0
= yMin0
= yMax0
= 0; // make gcc happy
3376 xMin1
= xMax1
= yMin1
= yMax1
= 0; // make gcc happy
3377 for (i
= 0; i
< nBlocks
; ++i
) {
3379 for (line
= blk
->lines
; line
; line
= line
->next
) {
3380 for (word
= line
->words
; word
; word
= word
->next
) {
// overlap test between the requested range and this word's range
3381 if (pos
< word
->charPos
+ word
->charLen
&&
3382 word
->charPos
< pos
+ length
) {
// j0 / j1: offsets of the first and last requested character relative
// to word->charPos; j1 is checked against word->len
// NOTE(review): the clamping statements themselves are not visible
3383 j0
= pos
- word
->charPos
;
3387 j1
= pos
+ length
- 1 - word
->charPos
;
3388 if (j1
>= word
->len
) {
// extent along the text direction comes from edge[j0] and edge[j1 + 1];
// the axis and ordering depend on line->rot (case labels not shown)
3391 switch (line
->rot
) {
3393 xMin1
= word
->edge
[j0
];
3394 xMax1
= word
->edge
[j1
+ 1];
3401 yMin1
= word
->edge
[j0
];
3402 yMax1
= word
->edge
[j1
+ 1];
3405 xMin1
= word
->edge
[j1
+ 1];
3406 xMax1
= word
->edge
[j0
];
3413 yMin1
= word
->edge
[j1
+ 1];
3414 yMax1
= word
->edge
[j0
];
// each bound is updated when 'first' is set or when the current word's
// box extends past the accumulated one
3417 if (first
|| xMin1
< xMin0
) {
3420 if (first
|| xMax1
> xMax0
) {
3423 if (first
|| yMin1
< yMin0
) {
3426 if (first
|| yMax1
> yMax0
) {
// TextPage::dump -- write the page's text by calling
// (*outputFunc)(outputStream, bytes, length).  Per the section comments
// below, three output modes exist: raw (content stream) order, physical
// layout (physLayout), and a mode that "undoes" the layout by walking
// flows, blocks and lines.  Space, EOL and end-of-page (0x0c) byte
// sequences are produced once via uMap->mapUnicode.
// NOTE(review): the embedded line numbers have gaps, so some original
// statements (including the rest of the parameter list) are not shown.
3444 void TextPage::dump(void *outputStream
, TextOutputFunc outputFunc
,
3450 TextLineFrag
*frags
;
3452 int nFrags
, fragsSize
;
3454 char space
[8], eol
[16], eop
[8];
3455 int spaceLen
, eolLen
, eopLen
;
3459 int col
, i
, j
, d
, n
;
3461 // get the output encoding
3462 if (!(uMap
= globalParams
->getTextEncoding())) {
3465 spaceLen
= uMap
->mapUnicode(0x20, space
, sizeof(space
));
3466 eolLen
= 0; // make gcc happy
// EOL bytes depend on globalParams->getTextEOL(): visible variants are
// 0x0a alone, 0x0d followed by 0x0a, and 0x0d alone
3467 switch (globalParams
->getTextEOL()) {
3469 eolLen
= uMap
->mapUnicode(0x0a, eol
, sizeof(eol
));
3472 eolLen
= uMap
->mapUnicode(0x0d, eol
, sizeof(eol
));
3473 eolLen
+= uMap
->mapUnicode(0x0a, eol
+ eolLen
, sizeof(eol
) - eolLen
);
3476 eolLen
= uMap
->mapUnicode(0x0d, eol
, sizeof(eol
));
3479 eopLen
= uMap
->mapUnicode(0x0c, eop
, sizeof(eop
));
3480 pageBreaks
= globalParams
->getTextPageBreaks();
3482 //~ writing mode (horiz/vert)
3484 // output the page in raw (content stream) order
// each word in the rawWords list is mapped into s and written out; the
// visible conditions then compare the next word's baseline and xMin
// with this word's to choose between writing a space and writing an EOL
3487 for (word
= rawWords
; word
; word
= word
->next
) {
3489 dumpFragment(word
->text
, word
->len
, uMap
, s
);
3490 (*outputFunc
)(outputStream
, s
->getCString(), s
->getLength());
3493 fabs(word
->next
->base
- word
->base
) <
3494 maxIntraLineDelta
* word
->fontSize
) {
3495 if (word
->next
->xMin
> word
->xMax
+ minWordSpacing
* word
->fontSize
) {
3496 (*outputFunc
)(outputStream
, space
, spaceLen
);
3499 (*outputFunc
)(outputStream
, eol
, eolLen
);
3503 // output the page, maintaining the original physical layout
3504 } else if (physLayout
) {
3506 // collect the line fragments for the page and sort them
3508 frags
= (TextLineFrag
*)gmallocn(fragsSize
, sizeof(TextLineFrag
));
// one fragment per line, covering the whole line (0 .. line->len);
// frags is reallocated when nFrags reaches fragsSize
3510 for (i
= 0; i
< nBlocks
; ++i
) {
3512 for (line
= blk
->lines
; line
; line
= line
->next
) {
3513 if (nFrags
== fragsSize
) {
3515 frags
= (TextLineFrag
*)greallocn(frags
,
3516 fragsSize
, sizeof(TextLineFrag
));
3518 frags
[nFrags
].init(line
, 0, line
->len
);
3519 frags
[nFrags
].computeCoords(gTrue
);
3523 qsort(frags
, nFrags
, sizeof(TextLineFrag
), &TextLineFrag::cmpYXPrimaryRot
);
// re-sort each run of fragments whose baselines are within
// maxIntraLineDelta * fontSize of frags[i].base by column
3525 while (i
< nFrags
) {
3526 delta
= maxIntraLineDelta
* frags
[i
].line
->words
->fontSize
;
3528 j
< nFrags
&& fabs(frags
[j
].base
- frags
[i
].base
) < delta
;
3530 qsort(frags
+ i
, j
- i
, sizeof(TextLineFrag
),
3531 &TextLineFrag::cmpXYColumnPrimaryRot
);
3535 #if 0 // for debugging
3536 printf("*** line fragments ***\n");
3537 for (i
= 0; i
< nFrags
; ++i
) {
3539 printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
3540 frag
->xMin
, frag
->xMax
, frag
->yMin
, frag
->yMax
, frag
->base
);
3541 for (n
= 0; n
< frag
->len
; ++n
) {
3542 fputc(frag
->line
->text
[frag
->start
+ n
] & 0xff, stdout
);
// write the fragments in sorted order, padding with spaces up to each
// fragment's column; col advances by dumpFragment's return value
3551 for (i
= 0; i
< nFrags
; ++i
) {
3555 for (; col
< frag
->col
; ++col
) {
3556 (*outputFunc
)(outputStream
, space
, spaceLen
);
3561 col
+= dumpFragment(frag
->line
->text
+ frag
->start
, frag
->len
, uMap
, s
);
3562 (*outputFunc
)(outputStream
, s
->getCString(), s
->getLength());
3565 // print one or more returns if necessary
3566 if (i
== nFrags
- 1 ||
3567 frags
[i
+1].col
< col
||
3568 fabs(frags
[i
+1].base
- frag
->base
) >
3569 maxIntraLineDelta
* frag
->line
->words
->fontSize
) {
// d: baseline distance to the next fragment divided by the font size,
// truncated to int; the loop below writes one EOL per remaining count
// NOTE(review): any adjustment of d between these statements is not
// visible in this listing
3570 if (i
< nFrags
- 1) {
3571 d
= (int)((frags
[i
+1].base
- frag
->base
) /
3572 frag
->line
->words
->fontSize
);
3581 for (; d
> 0; --d
) {
3582 (*outputFunc
)(outputStream
, eol
, eolLen
);
3590 // output the page, "undoing" the layout
3592 for (flow
= flows
; flow
; flow
= flow
->next
) {
3593 for (blk
= flow
->blocks
; blk
; blk
= blk
->next
) {
3594 for (line
= blk
->lines
; line
; line
= line
->next
) {
// a hyphenated line that is followed by another line or block is
// handled specially before its text is written
// NOTE(review): the statements computing n are not visible here
3596 if (line
->hyphenated
&& (line
->next
|| blk
->next
)) {
3600 dumpFragment(line
->text
, n
, uMap
, s
);
3601 (*outputFunc
)(outputStream
, s
->getCString(), s
->getLength());
3603 if (!line
->hyphenated
) {
3605 (*outputFunc
)(outputStream
, space
, spaceLen
);
3606 } else if (blk
->next
) {
3607 //~ this is a bit of a kludge - we should really do a more
3608 //~ intelligent determination of paragraphs
3609 if (blk
->next
->lines
->words
->fontSize
==
3610 blk
->lines
->words
->fontSize
) {
3611 (*outputFunc
)(outputStream
, space
, spaceLen
);
3613 (*outputFunc
)(outputStream
, eol
, eolLen
);
3619 (*outputFunc
)(outputStream
, eol
, eolLen
);
3620 (*outputFunc
)(outputStream
, eol
, eolLen
);
// write the end-of-page sequence (0x0c mapped through uMap)
// NOTE(review): presumably guarded by pageBreaks -- the condition is
// not visible in this listing
3626 (*outputFunc
)(outputStream
, eop
, eopLen
);
// TextPage::assignColumns -- set the col field of the nFrags fragments
// in frags.  Per the comments below there are two modes: when all text
// has one rotation, columns are recomputed from the fragments in the
// region; otherwise the existing col values are shifted so that the
// smallest becomes 0.
// NOTE(review): the embedded line numbers have gaps; the branch on
// oneRot, the assignments of frag0 / frag1 / col1 inside the loops and
// the rotation case labels are not visible in this listing.
3632 void TextPage::assignColumns(TextLineFrag
*frags
, int nFrags
, GBool oneRot
) {
3633 TextLineFrag
*frag0
, *frag1
;
3634 int rot
, col1
, col2
, i
, j
, k
;
3636 // all text in the region has the same rotation -- recompute the
3637 // column numbers based only on the text in the region
3639 qsort(frags
, nFrags
, sizeof(TextLineFrag
), &TextLineFrag::cmpXYLineRot
);
3640 rot
= frags
[0].line
->rot
;
// for each fragment i, every earlier fragment j (j < i) is examined
3641 for (i
= 0; i
< nFrags
; ++i
) {
3644 for (j
= 0; j
< i
; ++j
) {
3646 col2
= 0; // make gcc happy
// Four parallel cases follow, differing in the coordinate compared
// (xMin, yMin, xMax, yMax).  In each: if frag0 lies entirely beyond
// frag1, col2 = frag1->col + (column span of frag1) + 1; otherwise k
// walks frag1's characters while frag0's coordinate is beyond the
// midpoint of edge[k] and edge[k+1], and a column offset relative to
// frag1->start is computed from line->col[k].
3649 if (frag0
->xMin
>= frag1
->xMax
) {
3650 col2
= frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
3651 frag1
->line
->col
[frag1
->start
]) + 1;
3653 for (k
= frag1
->start
;
3654 k
< frag1
->start
+ frag1
->len
&&
3655 frag0
->xMin
>= 0.5 * (frag1
->line
->edge
[k
] +
3656 frag1
->line
->edge
[k
+1]);
3659 frag1
->line
->col
[k
] - frag1
->line
->col
[frag1
->start
];
3663 if (frag0
->yMin
>= frag1
->yMax
) {
3664 col2
= frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
3665 frag1
->line
->col
[frag1
->start
]) + 1;
3667 for (k
= frag1
->start
;
3668 k
< frag1
->start
+ frag1
->len
&&
3669 frag0
->yMin
>= 0.5 * (frag1
->line
->edge
[k
] +
3670 frag1
->line
->edge
[k
+1]);
3673 frag1
->line
->col
[k
] - frag1
->line
->col
[frag1
->start
];
3677 if (frag0
->xMax
<= frag1
->xMin
) {
3678 col2
= frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
3679 frag1
->line
->col
[frag1
->start
]) + 1;
3681 for (k
= frag1
->start
;
3682 k
< frag1
->start
+ frag1
->len
&&
3683 frag0
->xMax
<= 0.5 * (frag1
->line
->edge
[k
] +
3684 frag1
->line
->edge
[k
+1]);
3687 frag1
->line
->col
[k
] - frag1
->line
->col
[frag1
->start
];
3691 if (frag0
->yMax
<= frag1
->yMin
) {
3692 col2
= frag1
->col
+ (frag1
->line
->col
[frag1
->start
+ frag1
->len
] -
3693 frag1
->line
->col
[frag1
->start
]) + 1;
3695 for (k
= frag1
->start
;
3696 k
< frag1
->start
+ frag1
->len
&&
3697 frag0
->yMax
<= 0.5 * (frag1
->line
->edge
[k
] +
3698 frag1
->line
->edge
[k
+1]);
3701 frag1
->line
->col
[k
] - frag1
->line
->col
[frag1
->start
];
3712 // the region includes text at different rotations -- use the
3713 // globally assigned column numbers, offset by the minimum column
3714 // number (i.e., shift everything over to column 0)
// first pass finds the minimum col in col1, second pass subtracts it
3716 col1
= frags
[0].col
;
3717 for (i
= 1; i
< nFrags
; ++i
) {
3718 if (frags
[i
].col
< col1
) {
3719 col1
= frags
[i
].col
;
3722 for (i
= 0; i
< nFrags
; ++i
) {
3723 frags
[i
].col
-= col1
;
// TextPage::dumpFragment -- convert len Unicode values from text with
// uMap->mapUnicode and append the result to s.  When uMap->isUnicode()
// is true, the byte sequences for U+202A, U+202B and U+202C (stored in
// lre, rle and popdf) are prepared and appended around runs found with
// unicodeTypeL / unicodeTypeR, and right-to-left runs are emitted in
// reverse index order; otherwise the characters are mapped in order.
// Declared to return int; callers in this file add the result to a
// column counter.  The return statement is not visible here.
// NOTE(review): the embedded line numbers have gaps, so some original
// statements (including the rest of the parameter list and the branch
// that selects between the two directional variants) are not shown.
3728 int TextPage::dumpFragment(Unicode
*text
, int len
, UnicodeMap
*uMap
,
3730 char lre
[8], rle
[8], popdf
[8], buf
[8];
3731 int lreLen
, rleLen
, popdfLen
, n
;
3736 if (uMap
->isUnicode()) {
3738 lreLen
= uMap
->mapUnicode(0x202a, lre
, sizeof(lre
));
3739 rleLen
= uMap
->mapUnicode(0x202b, rle
, sizeof(rle
));
3740 popdfLen
= uMap
->mapUnicode(0x202c, popdf
, sizeof(popdf
));
// first variant: scan forward from i; j marks the end of the current run
3746 // output a left-to-right section
3747 for (j
= i
; j
< len
&& !unicodeTypeR(text
[j
]); ++j
) ;
3748 for (k
= i
; k
< j
; ++k
) {
3749 n
= uMap
->mapUnicode(text
[k
], buf
, sizeof(buf
));
3754 // output a right-to-left section
3755 for (j
= i
; j
< len
&& !unicodeTypeL(text
[j
]); ++j
) ;
// the run [i, j) is emitted from j - 1 down to i, between rle and popdf
3757 s
->append(rle
, rleLen
);
3758 for (k
= j
- 1; k
>= i
; --k
) {
3759 n
= uMap
->mapUnicode(text
[k
], buf
, sizeof(buf
));
3763 s
->append(popdf
, popdfLen
);
// second variant: the whole output starts with rle and the text is
// scanned backward from i (j decreases towards -1)
3770 s
->append(rle
, rleLen
);
3773 // output a right-to-left section
3774 for (j
= i
; j
>= 0 && !unicodeTypeL(text
[j
]); --j
) ;
3775 for (k
= i
; k
> j
; --k
) {
3776 n
= uMap
->mapUnicode(text
[k
], buf
, sizeof(buf
));
3781 // output a left-to-right section
3782 for (j
= i
; j
>= 0 && !unicodeTypeR(text
[j
]); --j
) ;
// the run (j, i] is emitted in increasing index order, between lre and popdf
3784 s
->append(lre
, lreLen
);
3785 for (k
= j
+ 1; k
<= i
; ++k
) {
3786 n
= uMap
->mapUnicode(text
[k
], buf
, sizeof(buf
));
3790 s
->append(popdf
, popdfLen
);
3794 s
->append(popdf
, popdfLen
);
// non-Unicode output map: map each character in order
3799 for (i
= 0; i
< len
; ++i
) {
3800 n
= uMap
->mapUnicode(text
[i
], buf
, sizeof(buf
));
// TextPage::makeWordList -- only compiled when TEXTOUT_WORD_LIST is
// set.  Returns a newly allocated TextWordList constructed from this
// page and the physLayout flag.
// NOTE(review): presumably the caller owns the returned object -- confirm.
3809 #if TEXTOUT_WORD_LIST
3810 TextWordList
*TextPage::makeWordList(GBool physLayout
) {
3811 return new TextWordList(this, physLayout
);
3815 //------------------------------------------------------------------------
3817 //------------------------------------------------------------------------
// outputToFile -- file-local output callback: treats stream as a FILE *
// and writes len bytes of text to it with fwrite.  The fwrite result is
// not checked.
3819 static void outputToFile(void *stream
, char *text
, int len
) {
3820 fwrite(text
, 1, len
, (FILE *)stream
);
// TextOutputDev constructor (file name variant) -- stores the layout
// flags, selects the output stream and creates the TextPage.  A
// fileName of "-" selects stdout; any other name is opened with fopen
// in "ab" mode when append is set and "wb" otherwise.  outputToFile is
// installed as the output callback.
// NOTE(review): the embedded line numbers have gaps; the conditions
// guarding the setmode call, the error call and the NULL assignment are
// not visible in this listing.
3823 TextOutputDev::TextOutputDev(char *fileName
, GBool physLayoutA
,
3824 GBool rawOrderA
, GBool append
) {
3826 physLayout
= physLayoutA
;
3827 rawOrder
= rawOrderA
;
3834 if (!strcmp(fileName
, "-")) {
3835 outputStream
= stdout
;
3837 // keep DOS from munging the end-of-line characters
3838 setmode(fileno(stdout
), O_BINARY
);
3840 } else if ((outputStream
= fopen(fileName
, append
? "ab" : "wb"))) {
3843 error(-1, "Couldn't open text file '%s'", fileName
);
3847 outputFunc
= &outputToFile
;
3849 outputStream
= NULL
;
3852 // set up text xObject
3853 text
= new TextPage(rawOrderA
);
// TextOutputDev constructor (callback variant) -- stores the caller's
// stream pointer and layout flags and creates the TextPage.
// NOTE(review): the embedded line numbers have gaps; the statement that
// stores func is not visible in this listing.
3856 TextOutputDev::TextOutputDev(TextOutputFunc func
, void *stream
,
3857 GBool physLayoutA
, GBool rawOrderA
) {
3859 outputStream
= stream
;
3861 physLayout
= physLayoutA
;
3862 rawOrder
= rawOrderA
;
3864 text
= new TextPage(rawOrderA
);
// TextOutputDev destructor -- the visible statements call
// ICS_MapRefNumAndAssign on the stream's handle and close outputStream
// with fclose, treating it as a FILE *.
// NOTE(review): the conditions guarding these calls are not visible in
// this listing.
3868 TextOutputDev::~TextOutputDev() {
3871 ICS_MapRefNumAndAssign((short)((FILE *)outputStream
)->handle
);
3873 fclose((FILE *)outputStream
);
// TextOutputDev::startPage -- forwards the graphics state to
// text->startPage.  pageNum is not used in the visible code.
3880 void TextOutputDev::startPage(int pageNum
, GfxState
*state
) {
3881 text
->startPage(state
);
// TextOutputDev::endPage -- calls text->coalesce with the layout flags
// and then text->dump with the configured stream and callback.
// NOTE(review): the embedded line numbers have gaps, so any guard around
// either call is not visible in this listing.
3884 void TextOutputDev::endPage() {
3886 text
->coalesce(physLayout
, doHTML
);
3888 text
->dump(outputStream
, outputFunc
, physLayout
);
// TextOutputDev::updateFont -- forwards the graphics state to
// text->updateFont.
3892 void TextOutputDev::updateFont(GfxState
*state
) {
3893 text
->updateFont(state
);
// TextOutputDev::beginString -- no body statements are visible in this
// listing for this method.
3896 void TextOutputDev::beginString(GfxState
*state
, GString
*s
) {
// TextOutputDev::endString -- no body statements are visible in this
// listing for this method.
3899 void TextOutputDev::endString(GfxState
*state
) {
// TextOutputDev::drawChar -- forwards one character to text->addChar.
// originX and originY are accepted but not passed on.
3902 void TextOutputDev::drawChar(GfxState
*state
, double x
, double y
,
3903 double dx
, double dy
,
3904 double originX
, double originY
,
3905 CharCode c
, int nBytes
, Unicode
*u
, int uLen
) {
3906 text
->addChar(state
, x
, y
, dx
, dy
, c
, nBytes
, u
, uLen
);
// TextOutputDev::stroke -- inspects the current path; the visible code
// checks for exactly one subpath with exactly two points, transforms
// both points with state->transform, and registers the segment with
// text->addUnderline when the transformed points share an x or a y
// coordinate (exact floating-point equality).
// NOTE(review): the embedded line numbers have gaps; the bodies of the
// two "!=" checks are not visible (presumably early returns).
3909 void TextOutputDev::stroke(GfxState
*state
) {
3911 GfxSubpath
*subpath
;
3917 path
= state
->getPath();
3918 if (path
->getNumSubpaths() != 1) {
3921 subpath
= path
->getSubpath(0);
3922 if (subpath
->getNumPoints() != 2) {
3925 state
->transform(subpath
->getX(0), subpath
->getY(0), &x
[0], &y
[0]);
3926 state
->transform(subpath
->getX(1), subpath
->getY(1), &x
[1], &y
[1]);
3928 // look for a vertical or horizontal line
3929 if (x
[0] == x
[1] || y
[0] == y
[1]) {
3930 text
->addUnderline(x
[0], y
[0], x
[1], y
[1]);
// TextOutputDev::fill -- inspects the current path; the visible code
// checks for one subpath with five points, tests each point with
// getCurve, transforms the points, and matches them against two
// axis-aligned closed rectangle patterns (exact floating-point
// equality).  A rectangle whose shorter side is below maxUnderlineWidth
// is registered with text->addUnderline as a line through the middle of
// that shorter side.
// NOTE(review): the embedded line numbers have gaps; the statements
// that compute rx0 / ry0 / rx1 / ry1 and the bodies of the rejection
// checks are not visible in this listing.
3934 void TextOutputDev::fill(GfxState
*state
) {
3936 GfxSubpath
*subpath
;
3938 double rx0
, ry0
, rx1
, ry1
, t
;
3944 path
= state
->getPath();
3945 if (path
->getNumSubpaths() != 1) {
3948 subpath
= path
->getSubpath(0);
3949 if (subpath
->getNumPoints() != 5) {
3952 for (i
= 0; i
< 5; ++i
) {
3953 if (subpath
->getCurve(i
)) {
3956 state
->transform(subpath
->getX(i
), subpath
->getY(i
), &x
[i
], &y
[i
]);
3959 // look for a rectangle
// first pattern: the first edge (point 0 to point 1) is vertical; the
// second pattern below has a horizontal first edge; both require
// point 4 to coincide with point 0
3960 if (x
[0] == x
[1] && y
[1] == y
[2] && x
[2] == x
[3] && y
[3] == y
[4] &&
3961 x
[0] == x
[4] && y
[0] == y
[4]) {
3966 } else if (y
[0] == y
[1] && x
[1] == x
[2] && y
[2] == y
[3] && x
[3] == x
[4] &&
3967 x
[0] == x
[4] && y
[0] == y
[4]) {
3986 // skinny horizontal rectangle
3987 if (ry1
- ry0
< rx1
- rx0
) {
3988 if (ry1
- ry0
< maxUnderlineWidth
) {
3989 ry0
= 0.5 * (ry0
+ ry1
);
3990 text
->addUnderline(rx0
, ry0
, rx1
, ry0
);
3993 // skinny vertical rectangle
3995 if (rx1
- rx0
< maxUnderlineWidth
) {
3996 rx0
= 0.5 * (rx0
+ rx1
);
3997 text
->addUnderline(rx0
, ry0
, rx0
, ry1
);
// TextOutputDev::eoFill -- no body statements are visible in this
// listing for this method (the embedded line numbers have gaps).
4002 void TextOutputDev::eoFill(GfxState
*state
) {
// TextOutputDev::processLink -- reads the link rectangle with
// link->getRect, converts its four corners with cvtUserToDev, and
// passes an integer box (xMin, yMin, xMax, yMax) together with the link
// to text->addLink.  catalog is not used in the visible code.
// NOTE(review): the embedded line numbers have gaps; the assignments
// that update xMin / yMin / xMax / yMax after each conversion are not
// visible, only the "> xMax" and "> yMax" comparisons are.
4009 void TextOutputDev::processLink(Link
*link
, Catalog
*catalog
) {
4010 double x1
, y1
, x2
, y2
;
4011 int xMin
, yMin
, xMax
, yMax
, x
, y
;
4016 link
->getRect(&x1
, &y1
, &x2
, &y2
);
4017 cvtUserToDev(x1
, y1
, &x
, &y
);
4020 cvtUserToDev(x1
, y2
, &x
, &y
);
4023 } else if (x
> xMax
) {
4028 } else if (y
> yMax
) {
4031 cvtUserToDev(x2
, y1
, &x
, &y
);
4034 } else if (x
> xMax
) {
4039 } else if (y
> yMax
) {
4042 cvtUserToDev(x2
, y2
, &x
, &y
);
4045 } else if (x
> xMax
) {
4050 } else if (y
> yMax
) {
4053 text
->addLink(xMin
, yMin
, xMax
, yMax
, link
);
// TextOutputDev::findText -- forwards all arguments unchanged to
// text->findText and returns its result.
4056 GBool
TextOutputDev::findText(Unicode
*s
, int len
,
4057 GBool startAtTop
, GBool stopAtBottom
,
4058 GBool startAtLast
, GBool stopAtLast
,
4059 GBool caseSensitive
, GBool backward
,
4060 double *xMin
, double *yMin
,
4061 double *xMax
, double *yMax
) {
4062 return text
->findText(s
, len
, startAtTop
, stopAtBottom
,
4063 startAtLast
, stopAtLast
, caseSensitive
, backward
,
4064 xMin
, yMin
, xMax
, yMax
);
// TextOutputDev::getText -- forwards the rectangle to text->getText and
// returns its result.
4067 GString
*TextOutputDev::getText(double xMin
, double yMin
,
4068 double xMax
, double yMax
) {
4069 return text
->getText(xMin
, yMin
, xMax
, yMax
);
// TextOutputDev::findCharRange -- forwards all arguments unchanged to
// text->findCharRange and returns its result.
4072 GBool
TextOutputDev::findCharRange(int pos
, int length
,
4073 double *xMin
, double *yMin
,
4074 double *xMax
, double *yMax
) {
4075 return text
->findCharRange(pos
, length
, xMin
, yMin
, xMax
, yMax
);
// TextOutputDev::makeWordList -- only compiled when TEXTOUT_WORD_LIST
// is set.  Returns text->makeWordList called with this device's
// physLayout flag.
4078 #if TEXTOUT_WORD_LIST
4079 TextWordList
*TextOutputDev::makeWordList() {
4080 return text
->makeWordList(physLayout
);
4084 TextPage
*TextOutputDev::takeText() {
4088 text
= new TextPage(rawOrder
);