I hope the node casts are correct here.
[AROS-Contrib.git] / arospdf / xpdf / TextOutputDev.cc
blob880cad3835e816d11b2eaee2610ba1b601fc5a2f
1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
9 #include <aconf.h>
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
13 #endif
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <stddef.h>
18 #include <math.h>
19 #include <ctype.h>
20 #ifdef WIN32
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h> // for setmode
23 #endif
24 #include "gmem.h"
25 #include "GString.h"
26 #include "GList.h"
27 #include "config.h"
28 #include "Error.h"
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
31 #include "UnicodeTypeTable.h"
32 #include "GfxState.h"
33 #include "Link.h"
34 #include "TextOutputDev.h"
36 #ifdef MACOS
37 // needed for setting type/creator of MacOS files
38 #include "ICSupport.h"
39 #endif
41 //------------------------------------------------------------------------
42 // parameters
43 //------------------------------------------------------------------------
45 // Each bucket in a text pool includes baselines within a range of
46 // this many points.
47 #define textPoolStep 4
49 // Inter-character space width which will cause addChar to start a new
50 // word.
51 #define minWordBreakSpace 0.1
53 // Negative inter-character space width, i.e., overlap, which will
54 // cause addChar to start a new word.
55 #define minDupBreakOverlap 0.2
57 // Max distance between baselines of two lines within a block, as a
58 // fraction of the font size.
59 #define maxLineSpacingDelta 1.5
61 // Max difference in primary font sizes on two lines in the same
62 // block. Delta1 is used when examining new lines above and below the
63 // current block; delta2 is used when examining text that overlaps the
64 // current block; delta3 is used when examining text to the left and
65 // right of the current block.
66 #define maxBlockFontSizeDelta1 0.05
67 #define maxBlockFontSizeDelta2 0.6
68 #define maxBlockFontSizeDelta3 0.2
70 // Max difference in font sizes inside a word.
71 #define maxWordFontSizeDelta 0.05
73 // Maximum distance between baselines of two words on the same line,
74 // e.g., distance between subscript or superscript and the primary
75 // baseline, as a fraction of the font size.
76 #define maxIntraLineDelta 0.5
78 // Minimum inter-word spacing, as a fraction of the font size. (Only
79 // used for raw ordering.)
80 #define minWordSpacing 0.15
82 // Maximum inter-word spacing, as a fraction of the font size.
83 #define maxWordSpacing 1.5
85 // Maximum horizontal spacing which will allow a word to be pulled
86 // into a block.
87 #define minColSpacing1 0.3
89 // Minimum spacing between columns, as a fraction of the font size.
90 #define minColSpacing2 1.0
92 // Maximum vertical spacing between blocks within a flow, as a
93 // multiple of the font size.
94 #define maxBlockSpacing 2.5
96 // Minimum spacing between characters within a word, as a fraction of
97 // the font size.
98 #define minCharSpacing -0.2
100 // Maximum spacing between characters within a word, as a fraction of
101 // the font size, when there is no obvious extra-wide character
102 // spacing.
103 #define maxCharSpacing 0.03
105 // When extra-wide character spacing is detected, the inter-character
106 // space threshold is set to the minimum inter-character space
107 // multiplied by this constant.
108 #define maxWideCharSpacingMul 1.3
110 // Upper limit on spacing between characters in a word.
111 #define maxWideCharSpacing 0.4
113 // Max difference in primary,secondary coordinates (as a fraction of
114 // the font size) allowed for duplicated text (fake boldface, drop
115 // shadows) which is to be discarded.
116 #define dupMaxPriDelta 0.1
117 #define dupMaxSecDelta 0.2
119 // Max width of underlines (in points).
120 #define maxUnderlineWidth 3
122 // Min distance between baseline and underline (in points).
123 //~ this should be font-size-dependent
124 #define minUnderlineGap -2
126 // Max distance between baseline and underline (in points).
127 //~ this should be font-size-dependent
128 #define maxUnderlineGap 4
130 // Max horizontal distance between edge of word and start of underline
131 // (in points).
132 //~ this should be font-size-dependent
133 #define underlineSlack 1
135 // Max distance between edge of text and edge of link border
136 #define hyperlinkSlack 2
138 //------------------------------------------------------------------------
139 // TextUnderline
140 //------------------------------------------------------------------------
142 class TextUnderline {
143 public:
145 TextUnderline(double x0A, double y0A, double x1A, double y1A)
146 { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
147 ~TextUnderline() {}
149 double x0, y0, x1, y1;
150 GBool horiz;
153 //------------------------------------------------------------------------
154 // TextLink
155 //------------------------------------------------------------------------
157 class TextLink {
158 public:
160 TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, Link *linkA)
161 { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; link = linkA; }
162 ~TextLink() {}
164 int xMin, yMin, xMax, yMax;
165 Link *link;
168 //------------------------------------------------------------------------
169 // TextFontInfo
170 //------------------------------------------------------------------------
172 TextFontInfo::TextFontInfo(GfxState *state) {
173 gfxFont = state->getFont();
174 #if TEXTOUT_WORD_LIST
175 fontName = (gfxFont && gfxFont->getOrigName())
176 ? gfxFont->getOrigName()->copy()
177 : (GString *)NULL;
178 flags = gfxFont ? gfxFont->getFlags() : 0;
179 #endif
182 TextFontInfo::~TextFontInfo() {
183 #if TEXTOUT_WORD_LIST
184 if (fontName) {
185 delete fontName;
187 #endif
190 GBool TextFontInfo::matches(GfxState *state) {
191 return state->getFont() == gfxFont;
194 //------------------------------------------------------------------------
195 // TextWord
196 //------------------------------------------------------------------------
198 TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
199 int charPosA, TextFontInfo *fontA, double fontSizeA) {
200 GfxFont *gfxFont;
201 double x, y, ascent, descent;
203 rot = rotA;
204 charPos = charPosA;
205 charLen = 0;
206 font = fontA;
207 fontSize = fontSizeA;
208 state->transform(x0, y0, &x, &y);
209 if ((gfxFont = font->gfxFont)) {
210 ascent = gfxFont->getAscent() * fontSize;
211 descent = gfxFont->getDescent() * fontSize;
212 } else {
213 // this means that the PDF file draws text without a current font,
214 // which should never happen
215 ascent = 0.95 * fontSize;
216 descent = -0.35 * fontSize;
218 switch (rot) {
219 case 0:
220 yMin = y - ascent;
221 yMax = y - descent;
222 if (yMin == yMax) {
223 // this is a sanity check for a case that shouldn't happen -- but
224 // if it does happen, we want to avoid dividing by zero later
225 yMin = y;
226 yMax = y + 1;
228 base = y;
229 break;
230 case 1:
231 xMin = x + descent;
232 xMax = x + ascent;
233 if (xMin == xMax) {
234 // this is a sanity check for a case that shouldn't happen -- but
235 // if it does happen, we want to avoid dividing by zero later
236 xMin = x;
237 xMax = x + 1;
239 base = x;
240 break;
241 case 2:
242 yMin = y + descent;
243 yMax = y + ascent;
244 if (yMin == yMax) {
245 // this is a sanity check for a case that shouldn't happen -- but
246 // if it does happen, we want to avoid dividing by zero later
247 yMin = y;
248 yMax = y + 1;
250 base = y;
251 break;
252 case 3:
253 xMin = x - ascent;
254 xMax = x - descent;
255 if (xMin == xMax) {
256 // this is a sanity check for a case that shouldn't happen -- but
257 // if it does happen, we want to avoid dividing by zero later
258 xMin = x;
259 xMax = x + 1;
261 base = x;
262 break;
264 text = NULL;
265 edge = NULL;
266 len = size = 0;
267 spaceAfter = gFalse;
268 next = NULL;
270 #if TEXTOUT_WORD_LIST
271 GfxRGB rgb;
273 if ((state->getRender() & 3) == 1) {
274 state->getStrokeRGB(&rgb);
275 } else {
276 state->getFillRGB(&rgb);
278 colorR = colToDbl(rgb.r);
279 colorG = colToDbl(rgb.g);
280 colorB = colToDbl(rgb.b);
281 #endif
283 underlined = gFalse;
284 link = NULL;
287 TextWord::~TextWord() {
288 gfree(text);
289 gfree(edge);
292 void TextWord::addChar(GfxState *state, double x, double y,
293 double dx, double dy, Unicode u) {
294 if (len == size) {
295 size += 16;
296 text = (Unicode *)greallocn(text, size, sizeof(Unicode));
297 edge = (double *)greallocn(edge, size + 1, sizeof(double));
299 text[len] = u;
300 switch (rot) {
301 case 0:
302 if (len == 0) {
303 xMin = x;
305 edge[len] = x;
306 xMax = edge[len+1] = x + dx;
307 break;
308 case 1:
309 if (len == 0) {
310 yMin = y;
312 edge[len] = y;
313 yMax = edge[len+1] = y + dy;
314 break;
315 case 2:
316 if (len == 0) {
317 xMax = x;
319 edge[len] = x;
320 xMin = edge[len+1] = x + dx;
321 break;
322 case 3:
323 if (len == 0) {
324 yMax = y;
326 edge[len] = y;
327 yMin = edge[len+1] = y + dy;
328 break;
330 ++len;
333 void TextWord::merge(TextWord *word) {
334 int i;
336 if (word->xMin < xMin) {
337 xMin = word->xMin;
339 if (word->yMin < yMin) {
340 yMin = word->yMin;
342 if (word->xMax > xMax) {
343 xMax = word->xMax;
345 if (word->yMax > yMax) {
346 yMax = word->yMax;
348 if (len + word->len > size) {
349 size = len + word->len;
350 text = (Unicode *)greallocn(text, size, sizeof(Unicode));
351 edge = (double *)greallocn(edge, size + 1, sizeof(double));
353 for (i = 0; i < word->len; ++i) {
354 text[len + i] = word->text[i];
355 edge[len + i] = word->edge[i];
357 edge[len + word->len] = word->edge[word->len];
358 len += word->len;
359 charLen += word->charLen;
362 inline int TextWord::primaryCmp(TextWord *word) {
363 double cmp;
365 cmp = 0; // make gcc happy
366 switch (rot) {
367 case 0:
368 cmp = xMin - word->xMin;
369 break;
370 case 1:
371 cmp = yMin - word->yMin;
372 break;
373 case 2:
374 cmp = word->xMax - xMax;
375 break;
376 case 3:
377 cmp = word->yMax - yMax;
378 break;
380 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
383 double TextWord::primaryDelta(TextWord *word) {
384 double delta;
386 delta = 0; // make gcc happy
387 switch (rot) {
388 case 0:
389 delta = word->xMin - xMax;
390 break;
391 case 1:
392 delta = word->yMin - yMax;
393 break;
394 case 2:
395 delta = xMin - word->xMax;
396 break;
397 case 3:
398 delta = yMin - word->yMax;
399 break;
401 return delta;
404 int TextWord::cmpYX(const void *p1, const void *p2) {
405 TextWord *word1 = *(TextWord **)p1;
406 TextWord *word2 = *(TextWord **)p2;
407 double cmp;
409 cmp = word1->yMin - word2->yMin;
410 if (cmp == 0) {
411 cmp = word1->xMin - word2->xMin;
413 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
416 #if TEXTOUT_WORD_LIST
418 GString *TextWord::getText() {
419 GString *s;
420 UnicodeMap *uMap;
421 char buf[8];
422 int n, i;
424 s = new GString();
425 if (!(uMap = globalParams->getTextEncoding())) {
426 return s;
428 for (i = 0; i < len; ++i) {
429 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
430 s->append(buf, n);
432 uMap->decRefCnt();
433 return s;
436 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
437 double *xMaxA, double *yMaxA) {
438 if (charIdx < 0 || charIdx >= len) {
439 return;
441 switch (rot) {
442 case 0:
443 *xMinA = edge[charIdx];
444 *xMaxA = edge[charIdx + 1];
445 *yMinA = yMin;
446 *yMaxA = yMax;
447 break;
448 case 1:
449 *xMinA = xMin;
450 *xMaxA = xMax;
451 *yMinA = edge[charIdx];
452 *yMaxA = edge[charIdx + 1];
453 break;
454 case 2:
455 *xMinA = edge[charIdx + 1];
456 *xMaxA = edge[charIdx];
457 *yMinA = yMin;
458 *yMaxA = yMax;
459 break;
460 case 3:
461 *xMinA = xMin;
462 *xMaxA = xMax;
463 *yMinA = edge[charIdx + 1];
464 *yMaxA = edge[charIdx];
465 break;
469 #endif // TEXTOUT_WORD_LIST
471 //------------------------------------------------------------------------
472 // TextPool
473 //------------------------------------------------------------------------
475 TextPool::TextPool() {
476 minBaseIdx = 0;
477 maxBaseIdx = -1;
478 pool = NULL;
479 cursor = NULL;
480 cursorBaseIdx = -1;
483 TextPool::~TextPool() {
484 int baseIdx;
485 TextWord *word, *word2;
487 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
488 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
489 word2 = word->next;
490 delete word;
493 gfree(pool);
496 int TextPool::getBaseIdx(double base) {
497 int baseIdx;
499 baseIdx = (int)(base / textPoolStep);
500 if (baseIdx < minBaseIdx) {
501 return minBaseIdx;
503 if (baseIdx > maxBaseIdx) {
504 return maxBaseIdx;
506 return baseIdx;
509 void TextPool::addWord(TextWord *word) {
510 TextWord **newPool;
511 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
512 TextWord *w0, *w1;
514 // expand the array if needed
515 wordBaseIdx = (int)(word->base / textPoolStep);
516 if (minBaseIdx > maxBaseIdx) {
517 minBaseIdx = wordBaseIdx - 128;
518 maxBaseIdx = wordBaseIdx + 128;
519 pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
520 sizeof(TextWord *));
521 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
522 pool[baseIdx - minBaseIdx] = NULL;
524 } else if (wordBaseIdx < minBaseIdx) {
525 newMinBaseIdx = wordBaseIdx - 128;
526 newPool = (TextWord **)gmallocn(maxBaseIdx - newMinBaseIdx + 1,
527 sizeof(TextWord *));
528 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
529 newPool[baseIdx - newMinBaseIdx] = NULL;
531 memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
532 (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
533 gfree(pool);
534 pool = newPool;
535 minBaseIdx = newMinBaseIdx;
536 } else if (wordBaseIdx > maxBaseIdx) {
537 newMaxBaseIdx = wordBaseIdx + 128;
538 pool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
539 sizeof(TextWord *));
540 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
541 pool[baseIdx - minBaseIdx] = NULL;
543 maxBaseIdx = newMaxBaseIdx;
546 // insert the new word
547 if (cursor && wordBaseIdx == cursorBaseIdx &&
548 word->primaryCmp(cursor) > 0) {
549 w0 = cursor;
550 w1 = cursor->next;
551 } else {
552 w0 = NULL;
553 w1 = pool[wordBaseIdx - minBaseIdx];
555 for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
556 word->next = w1;
557 if (w0) {
558 w0->next = word;
559 } else {
560 pool[wordBaseIdx - minBaseIdx] = word;
562 cursor = word;
563 cursorBaseIdx = wordBaseIdx;
566 //------------------------------------------------------------------------
567 // TextLine
568 //------------------------------------------------------------------------
570 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
571 blk = blkA;
572 rot = rotA;
573 xMin = yMin = 0;
574 xMax = yMax = -1;
575 base = baseA;
576 words = lastWord = NULL;
577 text = NULL;
578 edge = NULL;
579 col = NULL;
580 len = 0;
581 convertedLen = 0;
582 hyphenated = gFalse;
583 next = NULL;
586 TextLine::~TextLine() {
587 TextWord *word;
589 while (words) {
590 word = words;
591 words = words->next;
592 delete word;
594 gfree(text);
595 gfree(edge);
596 gfree(col);
599 void TextLine::addWord(TextWord *word) {
600 if (lastWord) {
601 lastWord->next = word;
602 } else {
603 words = word;
605 lastWord = word;
607 if (xMin > xMax) {
608 xMin = word->xMin;
609 xMax = word->xMax;
610 yMin = word->yMin;
611 yMax = word->yMax;
612 } else {
613 if (word->xMin < xMin) {
614 xMin = word->xMin;
616 if (word->xMax > xMax) {
617 xMax = word->xMax;
619 if (word->yMin < yMin) {
620 yMin = word->yMin;
622 if (word->yMax > yMax) {
623 yMax = word->yMax;
628 double TextLine::primaryDelta(TextLine *line) {
629 double delta;
631 delta = 0; // make gcc happy
632 switch (rot) {
633 case 0:
634 delta = line->xMin - xMax;
635 break;
636 case 1:
637 delta = line->yMin - yMax;
638 break;
639 case 2:
640 delta = xMin - line->xMax;
641 break;
642 case 3:
643 delta = yMin - line->yMax;
644 break;
646 return delta;
649 int TextLine::primaryCmp(TextLine *line) {
650 double cmp;
652 cmp = 0; // make gcc happy
653 switch (rot) {
654 case 0:
655 cmp = xMin - line->xMin;
656 break;
657 case 1:
658 cmp = yMin - line->yMin;
659 break;
660 case 2:
661 cmp = line->xMax - xMax;
662 break;
663 case 3:
664 cmp = line->yMax - yMax;
665 break;
667 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
670 int TextLine::secondaryCmp(TextLine *line) {
671 double cmp;
673 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
674 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
677 int TextLine::cmpYX(TextLine *line) {
678 int cmp;
680 if ((cmp = secondaryCmp(line))) {
681 return cmp;
683 return primaryCmp(line);
686 int TextLine::cmpXY(const void *p1, const void *p2) {
687 TextLine *line1 = *(TextLine **)p1;
688 TextLine *line2 = *(TextLine **)p2;
689 int cmp;
691 if ((cmp = line1->primaryCmp(line2))) {
692 return cmp;
694 return line1->secondaryCmp(line2);
697 void TextLine::coalesce(UnicodeMap *uMap) {
698 TextWord *word0, *word1;
699 double space, delta, minSpace;
700 GBool isUnicode;
701 char buf[8];
702 int i, j;
704 if (words->next) {
706 // compute the inter-word space threshold
707 if (words->len > 1 || words->next->len > 1) {
708 minSpace = 0;
709 } else {
710 minSpace = words->primaryDelta(words->next);
711 for (word0 = words->next, word1 = word0->next;
712 word1 && minSpace > 0;
713 word0 = word1, word1 = word0->next) {
714 if (word1->len > 1) {
715 minSpace = 0;
717 delta = word0->primaryDelta(word1);
718 if (delta < minSpace) {
719 minSpace = delta;
723 if (minSpace <= 0) {
724 space = maxCharSpacing * words->fontSize;
725 } else {
726 space = maxWideCharSpacingMul * minSpace;
727 if (space > maxWideCharSpacing * words->fontSize) {
728 space = maxWideCharSpacing * words->fontSize;
732 // merge words
733 word0 = words;
734 word1 = words->next;
735 while (word1) {
736 if (word0->primaryDelta(word1) >= space) {
737 word0->spaceAfter = gTrue;
738 word0 = word1;
739 word1 = word1->next;
740 } else if (word0->font == word1->font &&
741 word0->underlined == word1->underlined &&
742 fabs(word0->fontSize - word1->fontSize) <
743 maxWordFontSizeDelta * words->fontSize &&
744 word1->charPos == word0->charPos + word0->charLen) {
745 word0->merge(word1);
746 word0->next = word1->next;
747 delete word1;
748 word1 = word0->next;
749 } else {
750 word0 = word1;
751 word1 = word1->next;
756 // build the line text
757 isUnicode = uMap ? uMap->isUnicode() : gFalse;
758 len = 0;
759 for (word1 = words; word1; word1 = word1->next) {
760 len += word1->len;
761 if (word1->spaceAfter) {
762 ++len;
765 text = (Unicode *)gmallocn(len, sizeof(Unicode));
766 edge = (double *)gmallocn(len + 1, sizeof(double));
767 i = 0;
768 for (word1 = words; word1; word1 = word1->next) {
769 for (j = 0; j < word1->len; ++j) {
770 text[i] = word1->text[j];
771 edge[i] = word1->edge[j];
772 ++i;
774 edge[i] = word1->edge[word1->len];
775 if (word1->spaceAfter) {
776 text[i] = (Unicode)0x0020;
777 ++i;
781 // compute convertedLen and set up the col array
782 col = (int *)gmallocn(len + 1, sizeof(int));
783 convertedLen = 0;
784 for (i = 0; i < len; ++i) {
785 col[i] = convertedLen;
786 if (isUnicode) {
787 ++convertedLen;
788 } else if (uMap) {
789 convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
792 col[len] = convertedLen;
794 // check for hyphen at end of line
795 //~ need to check for other chars used as hyphens
796 hyphenated = text[len - 1] == (Unicode)'-';
799 //------------------------------------------------------------------------
800 // TextLineFrag
801 //------------------------------------------------------------------------
803 class TextLineFrag {
804 public:
806 TextLine *line; // the line xObject
807 int start, len; // offset and length of this fragment
808 // (in Unicode chars)
809 double xMin, xMax; // bounding box coordinates
810 double yMin, yMax;
811 double base; // baseline virtual coordinate
812 int col; // first column
814 void init(TextLine *lineA, int startA, int lenA);
815 void computeCoords(GBool oneRot);
817 static int cmpYXPrimaryRot(const void *p1, const void *p2);
818 static int cmpYXLineRot(const void *p1, const void *p2);
819 static int cmpXYLineRot(const void *p1, const void *p2);
820 static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
821 static int cmpXYColumnLineRot(const void *p1, const void *p2);
824 void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
825 line = lineA;
826 start = startA;
827 len = lenA;
828 col = line->col[start];
831 void TextLineFrag::computeCoords(GBool oneRot) {
832 TextBlock *blk;
833 double d0, d1, d2, d3, d4;
835 if (oneRot) {
837 switch (line->rot) {
838 case 0:
839 xMin = line->edge[start];
840 xMax = line->edge[start + len];
841 yMin = line->yMin;
842 yMax = line->yMax;
843 break;
844 case 1:
845 xMin = line->xMin;
846 xMax = line->xMax;
847 yMin = line->edge[start];
848 yMax = line->edge[start + len];
849 break;
850 case 2:
851 xMin = line->edge[start + len];
852 xMax = line->edge[start];
853 yMin = line->yMin;
854 yMax = line->yMax;
855 break;
856 case 3:
857 xMin = line->xMin;
858 xMax = line->xMax;
859 yMin = line->edge[start + len];
860 yMax = line->edge[start];
861 break;
863 base = line->base;
865 } else {
867 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
869 xMin = line->edge[start];
870 xMax = line->edge[start + len];
871 yMin = line->yMin;
872 yMax = line->yMax;
873 base = line->base;
875 } else {
877 blk = line->blk;
878 d0 = line->edge[start];
879 d1 = line->edge[start + len];
880 d2 = d3 = d4 = 0; // make gcc happy
882 switch (line->rot) {
883 case 0:
884 d2 = line->yMin;
885 d3 = line->yMax;
886 d4 = line->base;
887 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
888 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
889 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
890 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
891 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
892 break;
893 case 1:
894 d2 = line->xMax;
895 d3 = line->xMin;
896 d4 = line->base;
897 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
898 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
899 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
900 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
901 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
902 break;
903 case 2:
904 d2 = line->yMax;
905 d3 = line->yMin;
906 d4 = line->base;
907 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
908 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
909 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
910 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
911 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
912 break;
913 case 3:
914 d2 = line->xMin;
915 d3 = line->xMax;
916 d4 = line->base;
917 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
918 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
919 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
920 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
921 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
922 break;
925 switch (line->blk->page->primaryRot) {
926 case 0:
927 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
928 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
929 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
930 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
931 base = blk->yMin + base * (blk->yMax - blk->yMin);
932 break;
933 case 1:
934 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
935 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
936 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
937 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
938 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
939 break;
940 case 2:
941 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
942 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
943 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
944 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
945 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
946 break;
947 case 3:
948 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
949 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
950 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
951 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
952 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
953 break;
960 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
961 TextLineFrag *frag1 = (TextLineFrag *)p1;
962 TextLineFrag *frag2 = (TextLineFrag *)p2;
963 double cmp;
965 cmp = 0; // make gcc happy
966 switch (frag1->line->blk->page->primaryRot) {
967 case 0:
968 if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
969 cmp = frag1->xMin - frag2->xMin;
971 break;
972 case 1:
973 if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
974 cmp = frag1->yMin - frag2->yMin;
976 break;
977 case 2:
978 if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
979 cmp = frag2->xMax - frag1->xMax;
981 break;
982 case 3:
983 if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
984 cmp = frag2->yMax - frag1->yMax;
986 break;
988 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
991 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
992 TextLineFrag *frag1 = (TextLineFrag *)p1;
993 TextLineFrag *frag2 = (TextLineFrag *)p2;
994 double cmp;
996 cmp = 0; // make gcc happy
997 switch (frag1->line->rot) {
998 case 0:
999 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1000 cmp = frag1->xMin - frag2->xMin;
1002 break;
1003 case 1:
1004 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1005 cmp = frag1->yMin - frag2->yMin;
1007 break;
1008 case 2:
1009 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1010 cmp = frag2->xMax - frag1->xMax;
1012 break;
1013 case 3:
1014 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1015 cmp = frag2->yMax - frag1->yMax;
1017 break;
1019 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1022 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
1023 TextLineFrag *frag1 = (TextLineFrag *)p1;
1024 TextLineFrag *frag2 = (TextLineFrag *)p2;
1025 double cmp;
1027 cmp = 0; // make gcc happy
1028 switch (frag1->line->rot) {
1029 case 0:
1030 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1031 cmp = frag1->yMin - frag2->yMin;
1033 break;
1034 case 1:
1035 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1036 cmp = frag2->xMax - frag1->xMax;
1038 break;
1039 case 2:
1040 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1041 cmp = frag2->yMin - frag1->yMin;
1043 break;
1044 case 3:
1045 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1046 cmp = frag1->xMax - frag2->xMax;
1048 break;
1050 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1053 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) {
1054 TextLineFrag *frag1 = (TextLineFrag *)p1;
1055 TextLineFrag *frag2 = (TextLineFrag *)p2;
1056 double cmp;
1058 // if columns overlap, compare y values
1059 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1060 frag2->line->col[frag2->start]) &&
1061 frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1062 frag1->line->col[frag1->start])) {
1063 cmp = 0; // make gcc happy
1064 switch (frag1->line->blk->page->primaryRot) {
1065 case 0: cmp = frag1->yMin - frag2->yMin; break;
1066 case 1: cmp = frag2->xMax - frag1->xMax; break;
1067 case 2: cmp = frag2->yMin - frag1->yMin; break;
1068 case 3: cmp = frag1->xMax - frag2->xMax; break;
1070 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1073 // otherwise, compare starting column
1074 return frag1->col - frag2->col;
1077 int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) {
1078 TextLineFrag *frag1 = (TextLineFrag *)p1;
1079 TextLineFrag *frag2 = (TextLineFrag *)p2;
1080 double cmp;
1082 // if columns overlap, compare y values
1083 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1084 frag2->line->col[frag2->start]) &&
1085 frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1086 frag1->line->col[frag1->start])) {
1087 cmp = 0; // make gcc happy
1088 switch (frag1->line->rot) {
1089 case 0: cmp = frag1->yMin - frag2->yMin; break;
1090 case 1: cmp = frag2->xMax - frag1->xMax; break;
1091 case 2: cmp = frag2->yMin - frag1->yMin; break;
1092 case 3: cmp = frag1->xMax - frag2->xMax; break;
1094 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1097 // otherwise, compare starting column
1098 return frag1->col - frag2->col;
1101 //------------------------------------------------------------------------
1102 // TextBlock
1103 //------------------------------------------------------------------------
1105 TextBlock::TextBlock(TextPage *pageA, int rotA) {
1106 page = pageA;
1107 rot = rotA;
1108 xMin = yMin = 0;
1109 xMax = yMax = -1;
1110 priMin = 0;
1111 priMax = page->pageWidth;
1112 pool = new TextPool();
1113 lines = NULL;
1114 curLine = NULL;
1115 next = NULL;
1116 stackNext = NULL;
1119 TextBlock::~TextBlock() {
1120 TextLine *line;
1122 delete pool;
1123 while (lines) {
1124 line = lines;
1125 lines = lines->next;
1126 delete line;
1130 void TextBlock::addWord(TextWord *word) {
1131 pool->addWord(word);
1132 if (xMin > xMax) {
1133 xMin = word->xMin;
1134 xMax = word->xMax;
1135 yMin = word->yMin;
1136 yMax = word->yMax;
1137 } else {
1138 if (word->xMin < xMin) {
1139 xMin = word->xMin;
1141 if (word->xMax > xMax) {
1142 xMax = word->xMax;
1144 if (word->yMin < yMin) {
1145 yMin = word->yMin;
1147 if (word->yMax > yMax) {
1148 yMax = word->yMax;
1153 void TextBlock::coalesce(UnicodeMap *uMap) {
1154 TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1155 TextLine *line, *line0, *line1;
1156 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1157 int baseIdx, bestWordBaseIdx, idx0, idx1;
1158 double minBase, maxBase;
1159 double fontSize, delta, priDelta, secDelta;
1160 TextLine **lineArray;
1161 GBool found;
1162 int col1, col2;
1163 int i, j, k;
1165 // discard duplicated text (fake boldface, drop shadows)
1166 for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1167 word0 = pool->getPool(idx0);
1168 while (word0) {
1169 priDelta = dupMaxPriDelta * word0->fontSize;
1170 secDelta = dupMaxSecDelta * word0->fontSize;
1171 if (rot == 0 || rot == 3) {
1172 maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1173 } else {
1174 maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1176 found = gFalse;
1177 word1 = word2 = NULL; // make gcc happy
1178 for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1179 if (idx1 == idx0) {
1180 word1 = word0;
1181 word2 = word0->next;
1182 } else {
1183 word1 = NULL;
1184 word2 = pool->getPool(idx1);
1186 for (; word2; word1 = word2, word2 = word2->next) {
1187 if (word2->len == word0->len &&
1188 !memcmp(word2->text, word0->text,
1189 word0->len * sizeof(Unicode))) {
1190 switch (rot) {
1191 case 0:
1192 case 2:
1193 found = fabs(word0->xMin - word2->xMin) < priDelta &&
1194 fabs(word0->xMax - word2->xMax) < priDelta &&
1195 fabs(word0->yMin - word2->yMin) < secDelta &&
1196 fabs(word0->yMax - word2->yMax) < secDelta;
1197 break;
1198 case 1:
1199 case 3:
1200 found = fabs(word0->xMin - word2->xMin) < secDelta &&
1201 fabs(word0->xMax - word2->xMax) < secDelta &&
1202 fabs(word0->yMin - word2->yMin) < priDelta &&
1203 fabs(word0->yMax - word2->yMax) < priDelta;
1204 break;
1207 if (found) {
1208 break;
1211 if (found) {
1212 break;
1215 if (found) {
1216 if (word1) {
1217 word1->next = word2->next;
1218 } else {
1219 pool->setPool(idx1, word2->next);
1221 delete word2;
1222 } else {
1223 word0 = word0->next;
1228 // build the lines
1229 curLine = NULL;
1230 poolMinBaseIdx = pool->minBaseIdx;
1231 charCount = 0;
1232 nLines = 0;
1233 while (1) {
1235 // find the first non-empty line in the pool
1236 for (;
1237 poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1238 ++poolMinBaseIdx) ;
1239 if (poolMinBaseIdx > pool->maxBaseIdx) {
1240 break;
1243 // look for the left-most word in the first four lines of the
1244 // pool -- this avoids starting with a superscript word
1245 startBaseIdx = poolMinBaseIdx;
1246 for (baseIdx = poolMinBaseIdx + 1;
1247 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1248 ++baseIdx) {
1249 if (!pool->getPool(baseIdx)) {
1250 continue;
1252 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1253 < 0) {
1254 startBaseIdx = baseIdx;
1258 // create a new line
1259 word0 = pool->getPool(startBaseIdx);
1260 pool->setPool(startBaseIdx, word0->next);
1261 word0->next = NULL;
1262 line = new TextLine(this, word0->rot, word0->base);
1263 line->addWord(word0);
1264 lastWord = word0;
1266 // compute the search range
1267 fontSize = word0->fontSize;
1268 minBase = word0->base - maxIntraLineDelta * fontSize;
1269 maxBase = word0->base + maxIntraLineDelta * fontSize;
1270 minBaseIdx = pool->getBaseIdx(minBase);
1271 maxBaseIdx = pool->getBaseIdx(maxBase);
1273 // find the rest of the words in this line
1274 while (1) {
1276 // find the left-most word whose baseline is in the range for
1277 // this line
1278 bestWordBaseIdx = 0;
1279 bestWord0 = bestWord1 = NULL;
1280 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1281 for (word0 = NULL, word1 = pool->getPool(baseIdx);
1282 word1;
1283 word0 = word1, word1 = word1->next) {
1284 if (word1->base >= minBase &&
1285 word1->base <= maxBase &&
1286 (delta = lastWord->primaryDelta(word1)) >=
1287 minCharSpacing * fontSize) {
1288 if (delta < maxWordSpacing * fontSize &&
1289 (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1290 bestWordBaseIdx = baseIdx;
1291 bestWord0 = word0;
1292 bestWord1 = word1;
1294 break;
1298 if (!bestWord1) {
1299 break;
1302 // remove it from the pool, and add it to the line
1303 if (bestWord0) {
1304 bestWord0->next = bestWord1->next;
1305 } else {
1306 pool->setPool(bestWordBaseIdx, bestWord1->next);
1308 bestWord1->next = NULL;
1309 line->addWord(bestWord1);
1310 lastWord = bestWord1;
1313 // add the line
1314 if (curLine && line->cmpYX(curLine) > 0) {
1315 line0 = curLine;
1316 line1 = curLine->next;
1317 } else {
1318 line0 = NULL;
1319 line1 = lines;
1321 for (;
1322 line1 && line->cmpYX(line1) > 0;
1323 line0 = line1, line1 = line1->next) ;
1324 if (line0) {
1325 line0->next = line;
1326 } else {
1327 lines = line;
1329 line->next = line1;
1330 curLine = line;
1331 line->coalesce(uMap);
1332 charCount += line->len;
1333 ++nLines;
1336 // sort lines into xy order for column assignment
1337 lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1338 for (line = lines, i = 0; line; line = line->next, ++i) {
1339 lineArray[i] = line;
1341 qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1343 // column assignment
1344 nColumns = 0;
1345 for (i = 0; i < nLines; ++i) {
1346 line0 = lineArray[i];
1347 col1 = 0;
1348 for (j = 0; j < i; ++j) {
1349 line1 = lineArray[j];
1350 if (line1->primaryDelta(line0) >= 0) {
1351 col2 = line1->col[line1->len] + 1;
1352 } else {
1353 k = 0; // make gcc happy
1354 switch (rot) {
1355 case 0:
1356 for (k = 0;
1357 k < line1->len &&
1358 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1359 ++k) ;
1360 break;
1361 case 1:
1362 for (k = 0;
1363 k < line1->len &&
1364 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1365 ++k) ;
1366 break;
1367 case 2:
1368 for (k = 0;
1369 k < line1->len &&
1370 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1371 ++k) ;
1372 break;
1373 case 3:
1374 for (k = 0;
1375 k < line1->len &&
1376 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1377 ++k) ;
1378 break;
1380 col2 = line1->col[k];
1382 if (col2 > col1) {
1383 col1 = col2;
1386 for (k = 0; k <= line0->len; ++k) {
1387 line0->col[k] += col1;
1389 if (line0->col[line0->len] > nColumns) {
1390 nColumns = line0->col[line0->len];
1393 gfree(lineArray);
1396 void TextBlock::updatePriMinMax(TextBlock *blk) {
1397 double newPriMin, newPriMax;
1398 GBool gotPriMin, gotPriMax;
1400 gotPriMin = gotPriMax = gFalse;
1401 newPriMin = newPriMax = 0; // make gcc happy
1402 switch (page->primaryRot) {
1403 case 0:
1404 case 2:
1405 if (blk->yMin < yMax && blk->yMax > yMin) {
1406 if (blk->xMin < xMin) {
1407 newPriMin = blk->xMax;
1408 gotPriMin = gTrue;
1410 if (blk->xMax > xMax) {
1411 newPriMax = blk->xMin;
1412 gotPriMax = gTrue;
1415 break;
1416 case 1:
1417 case 3:
1418 if (blk->xMin < xMax && blk->xMax > xMin) {
1419 if (blk->yMin < yMin) {
1420 newPriMin = blk->yMax;
1421 gotPriMin = gTrue;
1423 if (blk->yMax > yMax) {
1424 newPriMax = blk->yMin;
1425 gotPriMax = gTrue;
1428 break;
1430 if (gotPriMin) {
1431 if (newPriMin > xMin) {
1432 newPriMin = xMin;
1434 if (newPriMin > priMin) {
1435 priMin = newPriMin;
1438 if (gotPriMax) {
1439 if (newPriMax < xMax) {
1440 newPriMax = xMax;
1442 if (newPriMax < priMax) {
1443 priMax = newPriMax;
1448 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1449 TextBlock *blk1 = *(TextBlock **)p1;
1450 TextBlock *blk2 = *(TextBlock **)p2;
1451 double cmp;
1453 cmp = 0; // make gcc happy
1454 switch (blk1->page->primaryRot) {
1455 case 0:
1456 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1457 cmp = blk1->yMin - blk2->yMin;
1459 break;
1460 case 1:
1461 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1462 cmp = blk2->xMax - blk1->xMax;
1464 break;
1465 case 2:
1466 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1467 cmp = blk2->yMin - blk1->yMin;
1469 break;
1470 case 3:
1471 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1472 cmp = blk1->xMax - blk2->xMax;
1474 break;
1476 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1479 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1480 TextBlock *blk1 = *(TextBlock **)p1;
1481 TextBlock *blk2 = *(TextBlock **)p2;
1482 double cmp;
1484 cmp = 0; // make gcc happy
1485 switch (blk1->page->primaryRot) {
1486 case 0:
1487 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1488 cmp = blk1->xMin - blk2->xMin;
1490 break;
1491 case 1:
1492 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1493 cmp = blk1->yMin - blk2->yMin;
1495 break;
1496 case 2:
1497 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1498 cmp = blk2->xMax - blk1->xMax;
1500 break;
1501 case 3:
1502 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1503 cmp = blk2->yMax - blk1->yMax;
1505 break;
1507 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1510 int TextBlock::primaryCmp(TextBlock *blk) {
1511 double cmp;
1513 cmp = 0; // make gcc happy
1514 switch (rot) {
1515 case 0:
1516 cmp = xMin - blk->xMin;
1517 break;
1518 case 1:
1519 cmp = yMin - blk->yMin;
1520 break;
1521 case 2:
1522 cmp = blk->xMax - xMax;
1523 break;
1524 case 3:
1525 cmp = blk->yMax - yMax;
1526 break;
1528 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1531 double TextBlock::secondaryDelta(TextBlock *blk) {
1532 double delta;
1534 delta = 0; // make gcc happy
1535 switch (rot) {
1536 case 0:
1537 delta = blk->yMin - yMax;
1538 break;
1539 case 1:
1540 delta = xMin - blk->xMax;
1541 break;
1542 case 2:
1543 delta = yMin - blk->yMax;
1544 break;
1545 case 3:
1546 delta = blk->xMin - xMax;
1547 break;
1549 return delta;
1552 GBool TextBlock::isBelow(TextBlock *blk) {
1553 GBool below;
1555 below = gFalse; // make gcc happy
1556 switch (page->primaryRot) {
1557 case 0:
1558 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1559 yMin > blk->yMin;
1560 break;
1561 case 1:
1562 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1563 xMax < blk->xMax;
1564 break;
1565 case 2:
1566 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1567 yMax < blk->yMax;
1568 break;
1569 case 3:
1570 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1571 xMin > blk->xMin;
1572 break;
1575 return below;
1578 //------------------------------------------------------------------------
1579 // TextFlow
1580 //------------------------------------------------------------------------
1582 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1583 page = pageA;
1584 xMin = blk->xMin;
1585 xMax = blk->xMax;
1586 yMin = blk->yMin;
1587 yMax = blk->yMax;
1588 priMin = blk->priMin;
1589 priMax = blk->priMax;
1590 blocks = lastBlk = blk;
1591 next = NULL;
1594 TextFlow::~TextFlow() {
1595 TextBlock *blk;
1597 while (blocks) {
1598 blk = blocks;
1599 blocks = blocks->next;
1600 delete blk;
1604 void TextFlow::addBlock(TextBlock *blk) {
1605 if (lastBlk) {
1606 lastBlk->next = blk;
1607 } else {
1608 blocks = blk;
1610 lastBlk = blk;
1611 if (blk->xMin < xMin) {
1612 xMin = blk->xMin;
1614 if (blk->xMax > xMax) {
1615 xMax = blk->xMax;
1617 if (blk->yMin < yMin) {
1618 yMin = blk->yMin;
1620 if (blk->yMax > yMax) {
1621 yMax = blk->yMax;
1625 GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1626 GBool fits;
1628 // lower blocks must use smaller fonts
1629 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1630 return gFalse;
1633 fits = gFalse; // make gcc happy
1634 switch (page->primaryRot) {
1635 case 0:
1636 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1637 break;
1638 case 1:
1639 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1640 break;
1641 case 2:
1642 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1643 break;
1644 case 3:
1645 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1646 break;
1648 return fits;
1651 #if TEXTOUT_WORD_LIST
1653 //------------------------------------------------------------------------
1654 // TextWordList
1655 //------------------------------------------------------------------------
1657 TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1658 TextFlow *flow;
1659 TextBlock *blk;
1660 TextLine *line;
1661 TextWord *word;
1662 TextWord **wordArray;
1663 int nWords, i;
1665 words = new GList();
1667 if (text->rawOrder) {
1668 for (word = text->rawWords; word; word = word->next) {
1669 words->append(word);
1672 } else if (physLayout) {
1673 // this is inefficient, but it's also the least useful of these
1674 // three cases
1675 nWords = 0;
1676 for (flow = text->flows; flow; flow = flow->next) {
1677 for (blk = flow->blocks; blk; blk = blk->next) {
1678 for (line = blk->lines; line; line = line->next) {
1679 for (word = line->words; word; word = word->next) {
1680 ++nWords;
1685 wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
1686 i = 0;
1687 for (flow = text->flows; flow; flow = flow->next) {
1688 for (blk = flow->blocks; blk; blk = blk->next) {
1689 for (line = blk->lines; line; line = line->next) {
1690 for (word = line->words; word; word = word->next) {
1691 wordArray[i++] = word;
1696 qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1697 for (i = 0; i < nWords; ++i) {
1698 words->append(wordArray[i]);
1700 gfree(wordArray);
1702 } else {
1703 for (flow = text->flows; flow; flow = flow->next) {
1704 for (blk = flow->blocks; blk; blk = blk->next) {
1705 for (line = blk->lines; line; line = line->next) {
1706 for (word = line->words; word; word = word->next) {
1707 words->append(word);
1715 TextWordList::~TextWordList() {
1716 delete words;
1719 int TextWordList::getLength() {
1720 return words->getLength();
1723 TextWord *TextWordList::get(int idx) {
1724 if (idx < 0 || idx >= words->getLength()) {
1725 return NULL;
1727 return (TextWord *)words->get(idx);
1730 #endif // TEXTOUT_WORD_LIST
1732 //------------------------------------------------------------------------
1733 // TextPage
1734 //------------------------------------------------------------------------
1736 TextPage::TextPage(GBool rawOrderA) {
1737 int rot;
1739 rawOrder = rawOrderA;
1740 curWord = NULL;
1741 charPos = 0;
1742 curFont = NULL;
1743 curFontSize = 0;
1744 nest = 0;
1745 nTinyChars = 0;
1746 lastCharOverlap = gFalse;
1747 if (!rawOrder) {
1748 for (rot = 0; rot < 4; ++rot) {
1749 pools[rot] = new TextPool();
1752 flows = NULL;
1753 blocks = NULL;
1754 rawWords = NULL;
1755 rawLastWord = NULL;
1756 fonts = new GList();
1757 lastFindXMin = lastFindYMin = 0;
1758 haveLastFind = gFalse;
1759 underlines = new GList();
1760 links = new GList();
1763 TextPage::~TextPage() {
1764 int rot;
1766 clear();
1767 if (!rawOrder) {
1768 for (rot = 0; rot < 4; ++rot) {
1769 delete pools[rot];
1772 delete fonts;
1773 deleteGList(underlines, TextUnderline);
1774 deleteGList(links, TextLink);
1777 void TextPage::startPage(GfxState *state) {
1778 clear();
1779 if (state) {
1780 pageWidth = state->getPageWidth();
1781 pageHeight = state->getPageHeight();
1782 } else {
1783 pageWidth = pageHeight = 0;
1787 void TextPage::endPage() {
1788 if (curWord) {
1789 endWord();
1793 void TextPage::clear() {
1794 int rot;
1795 TextFlow *flow;
1796 TextWord *word;
1798 if (curWord) {
1799 delete curWord;
1800 curWord = NULL;
1802 if (rawOrder) {
1803 while (rawWords) {
1804 word = rawWords;
1805 rawWords = rawWords->next;
1806 delete word;
1808 } else {
1809 for (rot = 0; rot < 4; ++rot) {
1810 delete pools[rot];
1812 while (flows) {
1813 flow = flows;
1814 flows = flows->next;
1815 delete flow;
1817 gfree(blocks);
1819 deleteGList(fonts, TextFontInfo);
1821 curWord = NULL;
1822 charPos = 0;
1823 curFont = NULL;
1824 curFontSize = 0;
1825 nest = 0;
1826 nTinyChars = 0;
1827 if (!rawOrder) {
1828 for (rot = 0; rot < 4; ++rot) {
1829 pools[rot] = new TextPool();
1832 flows = NULL;
1833 blocks = NULL;
1834 rawWords = NULL;
1835 rawLastWord = NULL;
1836 fonts = new GList();
1839 void TextPage::updateFont(GfxState *state) {
1840 GfxFont *gfxFont;
1841 double *fm;
1842 char *name;
1843 int code, mCode, letterCode, anyCode;
1844 double w;
1845 int i;
1847 // get the font info xObject
1848 curFont = NULL;
1849 for (i = 0; i < fonts->getLength(); ++i) {
1850 curFont = (TextFontInfo *)fonts->get(i);
1851 if (curFont->matches(state)) {
1852 break;
1854 curFont = NULL;
1856 if (!curFont) {
1857 curFont = new TextFontInfo(state);
1858 fonts->append(curFont);
1861 // adjust the font size
1862 gfxFont = state->getFont();
1863 curFontSize = state->getTransformedFontSize();
1864 if (gfxFont && gfxFont->getType() == fontType3) {
1865 // This is a hack which makes it possible to deal with some Type 3
1866 // fonts. The problem is that it's impossible to know what the
1867 // base coordinate system used in the font is without actually
1868 // rendering the font. This code tries to guess by looking at the
1869 // width of the character 'm' (which breaks if the font is a
1870 // subset that doesn't contain 'm').
1871 mCode = letterCode = anyCode = -1;
1872 for (code = 0; code < 256; ++code) {
1873 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1874 if (name && name[0] == 'm' && name[1] == '\0') {
1875 mCode = code;
1877 if (letterCode < 0 && name && name[1] == '\0' &&
1878 ((name[0] >= 'A' && name[0] <= 'Z') ||
1879 (name[0] >= 'a' && name[0] <= 'z'))) {
1880 letterCode = code;
1882 if (anyCode < 0 && name &&
1883 ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
1884 anyCode = code;
1887 if (mCode >= 0 &&
1888 (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1889 // 0.6 is a generic average 'm' width -- yes, this is a hack
1890 curFontSize *= w / 0.6;
1891 } else if (letterCode >= 0 &&
1892 (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1893 // even more of a hack: 0.5 is a generic letter width
1894 curFontSize *= w / 0.5;
1895 } else if (anyCode >= 0 &&
1896 (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1897 // better than nothing: 0.5 is a generic character width
1898 curFontSize *= w / 0.5;
1900 fm = gfxFont->getFontMatrix();
1901 if (fm[0] != 0) {
1902 curFontSize *= fabs(fm[3] / fm[0]);
1907 void TextPage::beginWord(GfxState *state, double x0, double y0) {
1908 double *fontm;
1909 double m[4], m2[4];
1910 int rot;
1912 // This check is needed because Type 3 characters can contain
1913 // text-drawing operations (when TextPage is being used via
1914 // {X,Win}SplashOutputDev rather than TextOutputDev).
1915 if (curWord) {
1916 ++nest;
1917 return;
1920 // compute the rotation
1921 state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
1922 if (state->getFont()->getType() == fontType3) {
1923 fontm = state->getFont()->getFontMatrix();
1924 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
1925 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
1926 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
1927 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
1928 m[0] = m2[0];
1929 m[1] = m2[1];
1930 m[2] = m2[2];
1931 m[3] = m2[3];
1933 if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1934 rot = (m[3] < 0) ? 0 : 2;
1935 } else {
1936 rot = (m[2] > 0) ? 1 : 3;
1939 curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
1942 void TextPage::addChar(GfxState *state, double x, double y,
1943 double dx, double dy,
1944 CharCode c, int nBytes, Unicode *u, int uLen) {
1945 double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
1946 GBool overlap;
1947 int i;
1949 // subtract char and word spacing from the dx,dy values
1950 sp = state->getCharSpace();
1951 if (c == (CharCode)0x20) {
1952 sp += state->getWordSpace();
1954 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1955 dx -= dx2;
1956 dy -= dy2;
1957 state->transformDelta(dx, dy, &w1, &h1);
1959 // throw away chars that aren't inside the page bounds
1960 // (and also do a sanity check on the character size)
1961 state->transform(x, y, &x1, &y1);
1962 if (x1 + w1 < 0 || x1 > pageWidth ||
1963 y1 + h1 < 0 || y1 > pageHeight ||
1964 w1 > pageWidth || h1 > pageHeight) {
1965 charPos += nBytes;
1966 return;
1969 // check the tiny chars limit
1970 if (!globalParams->getTextKeepTinyChars() &&
1971 fabs(w1) < 3 && fabs(h1) < 3) {
1972 if (++nTinyChars > 50000) {
1973 charPos += nBytes;
1974 return;
1978 // break words at space character
1979 if (uLen == 1 && u[0] == (Unicode)0x20) {
1980 if (curWord) {
1981 ++curWord->charLen;
1983 charPos += nBytes;
1984 endWord();
1985 return;
1988 // start a new word if:
1989 // (1) this character doesn't fall in the right place relative to
1990 // the end of the previous word (this places upper and lower
1991 // constraints on the position deltas along both the primary
1992 // and secondary axes), or
1993 // (2) this character overlaps the previous one (duplicated text), or
1994 // (3) the previous character was an overlap (we want each duplicated
1995 // character to be in a word by itself at this stage),
1996 // (4) the font size has changed
1997 if (curWord && curWord->len > 0) {
1998 base = sp = delta = 0; // make gcc happy
1999 switch (curWord->rot) {
2000 case 0:
2001 base = y1;
2002 sp = x1 - curWord->xMax;
2003 delta = x1 - curWord->edge[curWord->len - 1];
2004 break;
2005 case 1:
2006 base = x1;
2007 sp = y1 - curWord->yMax;
2008 delta = y1 - curWord->edge[curWord->len - 1];
2009 break;
2010 case 2:
2011 base = y1;
2012 sp = curWord->xMin - x1;
2013 delta = curWord->edge[curWord->len - 1] - x1;
2014 break;
2015 case 3:
2016 base = x1;
2017 sp = curWord->yMin - y1;
2018 delta = curWord->edge[curWord->len - 1] - y1;
2019 break;
2021 overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
2022 fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2023 if (overlap || lastCharOverlap ||
2024 sp < -minDupBreakOverlap * curWord->fontSize ||
2025 sp > minWordBreakSpace * curWord->fontSize ||
2026 fabs(base - curWord->base) > 0.5 ||
2027 curFontSize != curWord->fontSize) {
2028 endWord();
2030 lastCharOverlap = overlap;
2031 } else {
2032 lastCharOverlap = gFalse;
2035 if (uLen != 0) {
2036 // start a new word if needed
2037 if (!curWord) {
2038 beginWord(state, x, y);
2041 // page rotation and/or transform matrices can cause text to be
2042 // drawn in reverse order -- in this case, swap the begin/end
2043 // coordinates and break text into individual chars
2044 if ((curWord->rot == 0 && w1 < 0) ||
2045 (curWord->rot == 1 && h1 < 0) ||
2046 (curWord->rot == 2 && w1 > 0) ||
2047 (curWord->rot == 3 && h1 > 0)) {
2048 endWord();
2049 beginWord(state, x + dx, y + dy);
2050 x1 += w1;
2051 y1 += h1;
2052 w1 = -w1;
2053 h1 = -h1;
2056 // add the characters to the current word
2057 w1 /= uLen;
2058 h1 /= uLen;
2059 for (i = 0; i < uLen; ++i) {
2060 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
2063 if (curWord) {
2064 curWord->charLen += nBytes;
2066 charPos += nBytes;
2069 void TextPage::endWord() {
2070 // This check is needed because Type 3 characters can contain
2071 // text-drawing operations (when TextPage is being used via
2072 // {X,Win}SplashOutputDev rather than TextOutputDev).
2073 if (nest > 0) {
2074 --nest;
2075 return;
2078 if (curWord) {
2079 addWord(curWord);
2080 curWord = NULL;
2084 void TextPage::addWord(TextWord *word) {
2085 // throw away zero-length words -- they don't have valid xMin/xMax
2086 // values, and they're useless anyway
2087 if (word->len == 0) {
2088 delete word;
2089 return;
2092 if (rawOrder) {
2093 if (rawLastWord) {
2094 rawLastWord->next = word;
2095 } else {
2096 rawWords = word;
2098 rawLastWord = word;
2099 } else {
2100 pools[word->rot]->addWord(word);
2104 void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
2105 underlines->append(new TextUnderline(x0, y0, x1, y1));
2108 void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) {
2109 links->append(new TextLink(xMin, yMin, xMax, yMax, link));
2112 void TextPage::coalesce(GBool physLayout, GBool doHTML) {
2113 UnicodeMap *uMap;
2114 TextPool *pool;
2115 TextWord *word0, *word1, *word2;
2116 TextLine *line;
2117 TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
2118 TextBlock **blkArray;
2119 TextFlow *flow, *lastFlow;
2120 TextUnderline *underline;
2121 TextLink *link;
2122 int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2123 double minBase, maxBase, newMinBase, newMaxBase;
2124 double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2125 GBool found;
2126 int count[4];
2127 int lrCount;
2128 int firstBlkIdx, nBlocksLeft;
2129 int col1, col2;
2130 int i, j, n;
2132 if (rawOrder) {
2133 primaryRot = 0;
2134 primaryLR = gTrue;
2135 return;
2138 uMap = globalParams->getTextEncoding();
2139 blkList = NULL;
2140 lastBlk = NULL;
2141 nBlocks = 0;
2142 primaryRot = -1;
2144 #if 0 // for debugging
2145 printf("*** initial words ***\n");
2146 for (rot = 0; rot < 4; ++rot) {
2147 pool = pools[rot];
2148 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2149 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2150 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2151 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2152 word0->base, word0->fontSize, rot*90, word0->link);
2153 for (i = 0; i < word0->len; ++i) {
2154 fputc(word0->text[i] & 0xff, stdout);
2156 printf("'\n");
2160 printf("\n");
2161 #endif
2163 #if 0 //~ for debugging
2164 for (i = 0; i < underlines->getLength(); ++i) {
2165 underline = (TextUnderline *)underlines->get(i);
2166 printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2167 underline->x0, underline->x1, underline->y0, underline->y1,
2168 underline->horiz);
2170 #endif
2172 if (doHTML) {
2174 //----- handle underlining
2175 for (i = 0; i < underlines->getLength(); ++i) {
2176 underline = (TextUnderline *)underlines->get(i);
2177 if (underline->horiz) {
2178 // rot = 0
2179 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2180 startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2181 endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2182 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2183 for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2184 //~ need to check the y value against the word baseline
2185 if (underline->x0 < word0->xMin + underlineSlack &&
2186 word0->xMax - underlineSlack < underline->x1) {
2187 word0->underlined = gTrue;
2193 // rot = 2
2194 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2195 startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2196 endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2197 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2198 for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2199 if (underline->x0 < word0->xMin + underlineSlack &&
2200 word0->xMax - underlineSlack < underline->x1) {
2201 word0->underlined = gTrue;
2206 } else {
2207 // rot = 1
2208 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2209 startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2210 endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2211 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2212 for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2213 if (underline->y0 < word0->yMin + underlineSlack &&
2214 word0->yMax - underlineSlack < underline->y1) {
2215 word0->underlined = gTrue;
2221 // rot = 3
2222 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2223 startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2224 endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2225 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2226 for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2227 if (underline->y0 < word0->yMin + underlineSlack &&
2228 word0->yMax - underlineSlack < underline->y1) {
2229 word0->underlined = gTrue;
2237 //----- handle links
2238 for (i = 0; i < links->getLength(); ++i) {
2239 link = (TextLink *)links->get(i);
2241 // rot = 0
2242 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2243 startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2244 endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2245 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2246 for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2247 if (link->xMin < word0->xMin + hyperlinkSlack &&
2248 word0->xMax - hyperlinkSlack < link->xMax &&
2249 link->yMin < word0->yMin + hyperlinkSlack &&
2250 word0->yMax - hyperlinkSlack < link->yMax) {
2251 word0->link = link->link;
2257 // rot = 2
2258 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2259 startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2260 endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2261 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2262 for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2263 if (link->xMin < word0->xMin + hyperlinkSlack &&
2264 word0->xMax - hyperlinkSlack < link->xMax &&
2265 link->yMin < word0->yMin + hyperlinkSlack &&
2266 word0->yMax - hyperlinkSlack < link->yMax) {
2267 word0->link = link->link;
2273 // rot = 1
2274 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2275 startBaseIdx = pools[1]->getBaseIdx(link->xMin);
2276 endBaseIdx = pools[1]->getBaseIdx(link->xMax);
2277 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2278 for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2279 if (link->yMin < word0->yMin + hyperlinkSlack &&
2280 word0->yMax - hyperlinkSlack < link->yMax &&
2281 link->xMin < word0->xMin + hyperlinkSlack &&
2282 word0->xMax - hyperlinkSlack < link->xMax) {
2283 word0->link = link->link;
2289 // rot = 3
2290 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2291 startBaseIdx = pools[3]->getBaseIdx(link->xMin);
2292 endBaseIdx = pools[3]->getBaseIdx(link->xMax);
2293 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2294 for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2295 if (link->yMin < word0->yMin + hyperlinkSlack &&
2296 word0->yMax - hyperlinkSlack < link->yMax &&
2297 link->xMin < word0->xMin + hyperlinkSlack &&
2298 word0->xMax - hyperlinkSlack < link->xMax) {
2299 word0->link = link->link;
2307 //----- assemble the blocks
2309 //~ add an outer loop for writing mode (vertical text)
2311 // build blocks for each rotation value
2312 for (rot = 0; rot < 4; ++rot) {
2313 pool = pools[rot];
2314 poolMinBaseIdx = pool->minBaseIdx;
2315 count[rot] = 0;
2317 // add blocks until no more words are left
2318 while (1) {
2320 // find the first non-empty line in the pool
2321 for (;
2322 poolMinBaseIdx <= pool->maxBaseIdx &&
2323 !pool->getPool(poolMinBaseIdx);
2324 ++poolMinBaseIdx) ;
2325 if (poolMinBaseIdx > pool->maxBaseIdx) {
2326 break;
2329 // look for the left-most word in the first four lines of the
2330 // pool -- this avoids starting with a superscript word
2331 startBaseIdx = poolMinBaseIdx;
2332 for (baseIdx = poolMinBaseIdx + 1;
2333 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2334 ++baseIdx) {
2335 if (!pool->getPool(baseIdx)) {
2336 continue;
2338 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2339 < 0) {
2340 startBaseIdx = baseIdx;
2344 // create a new block
2345 word0 = pool->getPool(startBaseIdx);
2346 pool->setPool(startBaseIdx, word0->next);
2347 word0->next = NULL;
2348 blk = new TextBlock(this, rot);
2349 blk->addWord(word0);
2351 fontSize = word0->fontSize;
2352 minBase = maxBase = word0->base;
2353 colSpace1 = minColSpacing1 * fontSize;
2354 colSpace2 = minColSpacing2 * fontSize;
2355 lineSpace = maxLineSpacingDelta * fontSize;
2356 intraLineSpace = maxIntraLineDelta * fontSize;
2358 // add words to the block
2359 do {
2360 found = gFalse;
2362 // look for words on the line above the current top edge of
2363 // the block
2364 newMinBase = minBase;
2365 for (baseIdx = pool->getBaseIdx(minBase);
2366 baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2367 --baseIdx) {
2368 word0 = NULL;
2369 word1 = pool->getPool(baseIdx);
2370 while (word1) {
2371 if (word1->base < minBase &&
2372 word1->base >= minBase - lineSpace &&
2373 ((rot == 0 || rot == 2)
2374 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2375 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2376 fabs(word1->fontSize - fontSize) <
2377 maxBlockFontSizeDelta1 * fontSize) {
2378 word2 = word1;
2379 if (word0) {
2380 word0->next = word1->next;
2381 } else {
2382 pool->setPool(baseIdx, word1->next);
2384 word1 = word1->next;
2385 word2->next = NULL;
2386 blk->addWord(word2);
2387 found = gTrue;
2388 newMinBase = word2->base;
2389 } else {
2390 word0 = word1;
2391 word1 = word1->next;
2395 minBase = newMinBase;
2397 // look for words on the line below the current bottom edge of
2398 // the block
2399 newMaxBase = maxBase;
2400 for (baseIdx = pool->getBaseIdx(maxBase);
2401 baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2402 ++baseIdx) {
2403 word0 = NULL;
2404 word1 = pool->getPool(baseIdx);
2405 while (word1) {
2406 if (word1->base > maxBase &&
2407 word1->base <= maxBase + lineSpace &&
2408 ((rot == 0 || rot == 2)
2409 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2410 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2411 fabs(word1->fontSize - fontSize) <
2412 maxBlockFontSizeDelta1 * fontSize) {
2413 word2 = word1;
2414 if (word0) {
2415 word0->next = word1->next;
2416 } else {
2417 pool->setPool(baseIdx, word1->next);
2419 word1 = word1->next;
2420 word2->next = NULL;
2421 blk->addWord(word2);
2422 found = gTrue;
2423 newMaxBase = word2->base;
2424 } else {
2425 word0 = word1;
2426 word1 = word1->next;
2430 maxBase = newMaxBase;
2432 // look for words that are on lines already in the block, and
2433 // that overlap the block horizontally
2434 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2435 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2436 ++baseIdx) {
2437 word0 = NULL;
2438 word1 = pool->getPool(baseIdx);
2439 while (word1) {
2440 if (word1->base >= minBase - intraLineSpace &&
2441 word1->base <= maxBase + intraLineSpace &&
2442 ((rot == 0 || rot == 2)
2443 ? (word1->xMin < blk->xMax + colSpace1 &&
2444 word1->xMax > blk->xMin - colSpace1)
2445 : (word1->yMin < blk->yMax + colSpace1 &&
2446 word1->yMax > blk->yMin - colSpace1)) &&
2447 fabs(word1->fontSize - fontSize) <
2448 maxBlockFontSizeDelta2 * fontSize) {
2449 word2 = word1;
2450 if (word0) {
2451 word0->next = word1->next;
2452 } else {
2453 pool->setPool(baseIdx, word1->next);
2455 word1 = word1->next;
2456 word2->next = NULL;
2457 blk->addWord(word2);
2458 found = gTrue;
2459 } else {
2460 word0 = word1;
2461 word1 = word1->next;
2466 // only check for outlying words (the next two chunks of code)
2467 // if we didn't find anything else
2468 if (found) {
2469 continue;
2472 // scan down the left side of the block, looking for words
2473 // that are near (but not overlapping) the block; if there are
2474 // three or fewer, add them to the block
2475 n = 0;
2476 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2477 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2478 ++baseIdx) {
2479 word1 = pool->getPool(baseIdx);
2480 while (word1) {
2481 if (word1->base >= minBase - intraLineSpace &&
2482 word1->base <= maxBase + intraLineSpace &&
2483 ((rot == 0 || rot == 2)
2484 ? (word1->xMax <= blk->xMin &&
2485 word1->xMax > blk->xMin - colSpace2)
2486 : (word1->yMax <= blk->yMin &&
2487 word1->yMax > blk->yMin - colSpace2)) &&
2488 fabs(word1->fontSize - fontSize) <
2489 maxBlockFontSizeDelta3 * fontSize) {
2490 ++n;
2491 break;
2493 word1 = word1->next;
2496 if (n > 0 && n <= 3) {
2497 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2498 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2499 ++baseIdx) {
2500 word0 = NULL;
2501 word1 = pool->getPool(baseIdx);
2502 while (word1) {
2503 if (word1->base >= minBase - intraLineSpace &&
2504 word1->base <= maxBase + intraLineSpace &&
2505 ((rot == 0 || rot == 2)
2506 ? (word1->xMax <= blk->xMin &&
2507 word1->xMax > blk->xMin - colSpace2)
2508 : (word1->yMax <= blk->yMin &&
2509 word1->yMax > blk->yMin - colSpace2)) &&
2510 fabs(word1->fontSize - fontSize) <
2511 maxBlockFontSizeDelta3 * fontSize) {
2512 word2 = word1;
2513 if (word0) {
2514 word0->next = word1->next;
2515 } else {
2516 pool->setPool(baseIdx, word1->next);
2518 word1 = word1->next;
2519 word2->next = NULL;
2520 blk->addWord(word2);
2521 if (word2->base < minBase) {
2522 minBase = word2->base;
2523 } else if (word2->base > maxBase) {
2524 maxBase = word2->base;
2526 found = gTrue;
2527 break;
2528 } else {
2529 word0 = word1;
2530 word1 = word1->next;
2536 // scan down the right side of the block, looking for words
2537 // that are near (but not overlapping) the block; if there are
2538 // three or fewer, add them to the block
2539 n = 0;
2540 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2541 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2542 ++baseIdx) {
2543 word1 = pool->getPool(baseIdx);
2544 while (word1) {
2545 if (word1->base >= minBase - intraLineSpace &&
2546 word1->base <= maxBase + intraLineSpace &&
2547 ((rot == 0 || rot == 2)
2548 ? (word1->xMin >= blk->xMax &&
2549 word1->xMin < blk->xMax + colSpace2)
2550 : (word1->yMin >= blk->yMax &&
2551 word1->yMin < blk->yMax + colSpace2)) &&
2552 fabs(word1->fontSize - fontSize) <
2553 maxBlockFontSizeDelta3 * fontSize) {
2554 ++n;
2555 break;
2557 word1 = word1->next;
2560 if (n > 0 && n <= 3) {
2561 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2562 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2563 ++baseIdx) {
2564 word0 = NULL;
2565 word1 = pool->getPool(baseIdx);
2566 while (word1) {
2567 if (word1->base >= minBase - intraLineSpace &&
2568 word1->base <= maxBase + intraLineSpace &&
2569 ((rot == 0 || rot == 2)
2570 ? (word1->xMin >= blk->xMax &&
2571 word1->xMin < blk->xMax + colSpace2)
2572 : (word1->yMin >= blk->yMax &&
2573 word1->yMin < blk->yMax + colSpace2)) &&
2574 fabs(word1->fontSize - fontSize) <
2575 maxBlockFontSizeDelta3 * fontSize) {
2576 word2 = word1;
2577 if (word0) {
2578 word0->next = word1->next;
2579 } else {
2580 pool->setPool(baseIdx, word1->next);
2582 word1 = word1->next;
2583 word2->next = NULL;
2584 blk->addWord(word2);
2585 if (word2->base < minBase) {
2586 minBase = word2->base;
2587 } else if (word2->base > maxBase) {
2588 maxBase = word2->base;
2590 found = gTrue;
2591 break;
2592 } else {
2593 word0 = word1;
2594 word1 = word1->next;
2600 } while (found);
2602 //~ need to compute the primary writing mode (horiz/vert) in
2603 //~ addition to primary rotation
2605 // coalesce the block, and add it to the list
2606 blk->coalesce(uMap);
2607 if (lastBlk) {
2608 lastBlk->next = blk;
2609 } else {
2610 blkList = blk;
2612 lastBlk = blk;
2613 count[rot] += blk->charCount;
2614 if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2615 primaryRot = rot;
2617 ++nBlocks;
2621 #if 0 // for debugging
2622 printf("*** rotation ***\n");
2623 for (rot = 0; rot < 4; ++rot) {
2624 printf(" %d: %6d\n", rot, count[rot]);
2626 printf(" primary rot = %d\n", primaryRot);
2627 printf("\n");
2628 #endif
2630 #if 0 // for debugging
2631 printf("*** blocks ***\n");
2632 for (blk = blkList; blk; blk = blk->next) {
2633 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2634 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2635 for (line = blk->lines; line; line = line->next) {
2636 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2637 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2638 for (word0 = line->words; word0; word0 = word0->next) {
2639 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2640 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2641 word0->base, word0->fontSize, word0->spaceAfter);
2642 for (i = 0; i < word0->len; ++i) {
2643 fputc(word0->text[i] & 0xff, stdout);
2645 printf("'\n");
2649 printf("\n");
2650 #endif
2652 // determine the primary direction
2653 lrCount = 0;
2654 for (blk = blkList; blk; blk = blk->next) {
2655 for (line = blk->lines; line; line = line->next) {
2656 for (word0 = line->words; word0; word0 = word0->next) {
2657 for (i = 0; i < word0->len; ++i) {
2658 if (unicodeTypeL(word0->text[i])) {
2659 ++lrCount;
2660 } else if (unicodeTypeR(word0->text[i])) {
2661 --lrCount;
2667 primaryLR = lrCount >= 0;
2669 #if 0 // for debugging
2670 printf("*** direction ***\n");
2671 printf("lrCount = %d\n", lrCount);
2672 printf("primaryLR = %d\n", primaryLR);
2673 #endif
2675 //----- column assignment
2677 // sort blocks into xy order for column assignment
2678 blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2679 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2680 blocks[i] = blk;
2682 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2684 // column assignment
2685 for (i = 0; i < nBlocks; ++i) {
2686 blk0 = blocks[i];
2687 col1 = 0;
2688 for (j = 0; j < i; ++j) {
2689 blk1 = blocks[j];
2690 col2 = 0; // make gcc happy
2691 switch (primaryRot) {
2692 case 0:
2693 if (blk0->xMin > blk1->xMax) {
2694 col2 = blk1->col + blk1->nColumns + 3;
2695 } else if (blk1->xMax == blk1->xMin) {
2696 col2 = blk1->col;
2697 } else {
2698 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2699 (blk1->xMax - blk1->xMin)) *
2700 blk1->nColumns);
2702 break;
2703 case 1:
2704 if (blk0->yMin > blk1->yMax) {
2705 col2 = blk1->col + blk1->nColumns + 3;
2706 } else if (blk1->yMax == blk1->yMin) {
2707 col2 = blk1->col;
2708 } else {
2709 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2710 (blk1->yMax - blk1->yMin)) *
2711 blk1->nColumns);
2713 break;
2714 case 2:
2715 if (blk0->xMax < blk1->xMin) {
2716 col2 = blk1->col + blk1->nColumns + 3;
2717 } else if (blk1->xMin == blk1->xMax) {
2718 col2 = blk1->col;
2719 } else {
2720 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2721 (blk1->xMin - blk1->xMax)) *
2722 blk1->nColumns);
2724 break;
2725 case 3:
2726 if (blk0->yMax < blk1->yMin) {
2727 col2 = blk1->col + blk1->nColumns + 3;
2728 } else if (blk1->yMin == blk1->yMax) {
2729 col2 = blk1->col;
2730 } else {
2731 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2732 (blk1->yMin - blk1->yMax)) *
2733 blk1->nColumns);
2735 break;
2737 if (col2 > col1) {
2738 col1 = col2;
2741 blk0->col = col1;
2742 for (line = blk0->lines; line; line = line->next) {
2743 for (j = 0; j <= line->len; ++j) {
2744 line->col[j] += col1;
2749 #if 0 // for debugging
2750 printf("*** blocks, after column assignment ***\n");
2751 for (blk = blkList; blk; blk = blk->next) {
2752 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2753 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2754 blk->nColumns);
2755 for (line = blk->lines; line; line = line->next) {
2756 printf(" line:\n");
2757 for (word0 = line->words; word0; word0 = word0->next) {
2758 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2759 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2760 word0->base, word0->fontSize, word0->spaceAfter);
2761 for (i = 0; i < word0->len; ++i) {
2762 fputc(word0->text[i] & 0xff, stdout);
2764 printf("'\n");
2768 printf("\n");
2769 #endif
2771 //----- reading order sort
2773 // sort blocks into yx order (in preparation for reading order sort)
2774 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot);
2776 // compute space on left and right sides of each block
2777 for (i = 0; i < nBlocks; ++i) {
2778 blk0 = blocks[i];
2779 for (j = 0; j < nBlocks; ++j) {
2780 blk1 = blocks[j];
2781 if (blk1 != blk0) {
2782 blk0->updatePriMinMax(blk1);
2787 #if 0 // for debugging
2788 printf("*** blocks, after yx sort ***\n");
2789 for (i = 0; i < nBlocks; ++i) {
2790 blk = blocks[i];
2791 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
2792 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2793 blk->priMin, blk->priMax);
2794 for (line = blk->lines; line; line = line->next) {
2795 printf(" line:\n");
2796 for (word0 = line->words; word0; word0 = word0->next) {
2797 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2798 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2799 word0->base, word0->fontSize, word0->spaceAfter);
2800 for (j = 0; j < word0->len; ++j) {
2801 fputc(word0->text[j] & 0xff, stdout);
2803 printf("'\n");
2807 printf("\n");
2808 #endif
2810 // build the flows
2811 //~ this needs to be adjusted for writing mode (vertical text)
2812 //~ this also needs to account for right-to-left column ordering
2813 blkArray = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2814 memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *));
2815 flows = lastFlow = NULL;
2816 firstBlkIdx = 0;
2817 nBlocksLeft = nBlocks;
2818 while (nBlocksLeft > 0) {
2820 // find the upper-left-most block
2821 for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ;
2822 i = firstBlkIdx;
2823 blk = blkArray[i];
2824 for (j = firstBlkIdx + 1; j < nBlocks; ++j) {
2825 blk1 = blkArray[j];
2826 if (blk1) {
2827 if (blk && blk->secondaryDelta(blk1) > 0) {
2828 break;
2830 if (blk1->primaryCmp(blk) < 0) {
2831 i = j;
2832 blk = blk1;
2836 blkArray[i] = NULL;
2837 --nBlocksLeft;
2838 blk->next = NULL;
2840 // create a new flow, starting with the upper-left-most block
2841 flow = new TextFlow(this, blk);
2842 if (lastFlow) {
2843 lastFlow->next = flow;
2844 } else {
2845 flows = flow;
2847 lastFlow = flow;
2848 fontSize = blk->lines->words->fontSize;
2850 // push the upper-left-most block on the stack
2851 blk->stackNext = NULL;
2852 blkStack = blk;
2854 // find the other blocks in this flow
2855 while (blkStack) {
2857 // find the upper-left-most block under (but within
2858 // maxBlockSpacing of) the top block on the stack
2859 blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize;
2860 blk = NULL;
2861 i = -1;
2862 for (j = firstBlkIdx; j < nBlocks; ++j) {
2863 blk1 = blkArray[j];
2864 if (blk1) {
2865 if (blkStack->secondaryDelta(blk1) > blkSpace) {
2866 break;
2868 if (blk && blk->secondaryDelta(blk1) > 0) {
2869 break;
2871 if (blk1->isBelow(blkStack) &&
2872 (!blk || blk1->primaryCmp(blk) < 0)) {
2873 i = j;
2874 blk = blk1;
2879 // if a suitable block was found, add it to the flow and push it
2880 // onto the stack
2881 if (blk && flow->blockFits(blk, blkStack)) {
2882 blkArray[i] = NULL;
2883 --nBlocksLeft;
2884 blk->next = NULL;
2885 flow->addBlock(blk);
2886 fontSize = blk->lines->words->fontSize;
2887 blk->stackNext = blkStack;
2888 blkStack = blk;
2890 // otherwise (if there is no block under the top block or the
2891 // block is not suitable), pop the stack
2892 } else {
2893 blkStack = blkStack->stackNext;
2897 gfree(blkArray);
2899 #if 0 // for debugging
2900 printf("*** flows ***\n");
2901 for (flow = flows; flow; flow = flow->next) {
2902 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
2903 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
2904 flow->priMin, flow->priMax);
2905 for (blk = flow->blocks; blk; blk = blk->next) {
2906 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
2907 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2908 blk->priMin, blk->priMax);
2909 for (line = blk->lines; line; line = line->next) {
2910 printf(" line:\n");
2911 for (word0 = line->words; word0; word0 = word0->next) {
2912 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2913 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2914 word0->base, word0->fontSize, word0->spaceAfter);
2915 for (i = 0; i < word0->len; ++i) {
2916 fputc(word0->text[i] & 0xff, stdout);
2918 printf("'\n");
2923 printf("\n");
2924 #endif
2926 if (uMap) {
2927 uMap->decRefCnt();
2931 GBool TextPage::findText(Unicode *s, int len,
2932 GBool startAtTop, GBool stopAtBottom,
2933 GBool startAtLast, GBool stopAtLast,
2934 GBool caseSensitive, GBool backward,
2935 double *xMin, double *yMin,
2936 double *xMax, double *yMax) {
2937 TextBlock *blk;
2938 TextLine *line;
2939 Unicode *s2, *txt;
2940 Unicode *p;
2941 int txtSize, m, i, j, k;
2942 double xStart, yStart, xStop, yStop;
2943 double xMin0, yMin0, xMax0, yMax0;
2944 double xMin1, yMin1, xMax1, yMax1;
2945 GBool found;
2947 //~ needs to handle right-to-left text
2949 if (rawOrder) {
2950 return gFalse;
2953 // convert the search string to uppercase
2954 if (!caseSensitive) {
2955 s2 = (Unicode *)gmallocn(len, sizeof(Unicode));
2956 for (i = 0; i < len; ++i) {
2957 s2[i] = unicodeToUpper(s[i]);
2959 } else {
2960 s2 = s;
2963 txt = NULL;
2964 txtSize = 0;
2966 xStart = yStart = xStop = yStop = 0;
2967 if (startAtLast && haveLastFind) {
2968 xStart = lastFindXMin;
2969 yStart = lastFindYMin;
2970 } else if (!startAtTop) {
2971 xStart = *xMin;
2972 yStart = *yMin;
2974 if (stopAtLast && haveLastFind) {
2975 xStop = lastFindXMin;
2976 yStop = lastFindYMin;
2977 } else if (!stopAtBottom) {
2978 xStop = *xMax;
2979 yStop = *yMax;
2982 found = gFalse;
2983 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2984 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2986 for (i = backward ? nBlocks - 1 : 0;
2987 backward ? i >= 0 : i < nBlocks;
2988 i += backward ? -1 : 1) {
2989 blk = blocks[i];
2991 // check: is the block above the top limit?
2992 if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
2993 continue;
2996 // check: is the block below the bottom limit?
2997 if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
2998 break;
3001 for (line = blk->lines; line; line = line->next) {
3003 // check: is the line above the top limit?
3004 if (!startAtTop &&
3005 (backward ? line->yMin > yStart : line->yMin < yStart)) {
3006 continue;
3009 // check: is the line below the bottom limit?
3010 if (!stopAtBottom &&
3011 (backward ? line->yMin < yStop : line->yMin > yStop)) {
3012 continue;
3015 // convert the line to uppercase
3016 m = line->len;
3017 if (!caseSensitive) {
3018 if (m > txtSize) {
3019 txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3020 txtSize = m;
3022 for (k = 0; k < m; ++k) {
3023 txt[k] = unicodeToUpper(line->text[k]);
3025 } else {
3026 txt = line->text;
3029 // search each position in this line
3030 j = backward ? m - len : 0;
3031 p = txt + j;
3032 while (backward ? j >= 0 : j <= m - len) {
3034 // compare the strings
3035 for (k = 0; k < len; ++k) {
3036 if (p[k] != s2[k]) {
3037 break;
3041 // found it
3042 if (k == len) {
3043 switch (line->rot) {
3044 case 0:
3045 xMin1 = line->edge[j];
3046 xMax1 = line->edge[j + len];
3047 yMin1 = line->yMin;
3048 yMax1 = line->yMax;
3049 break;
3050 case 1:
3051 xMin1 = line->xMin;
3052 xMax1 = line->xMax;
3053 yMin1 = line->edge[j];
3054 yMax1 = line->edge[j + len];
3055 break;
3056 case 2:
3057 xMin1 = line->edge[j + len];
3058 xMax1 = line->edge[j];
3059 yMin1 = line->yMin;
3060 yMax1 = line->yMax;
3061 break;
3062 case 3:
3063 xMin1 = line->xMin;
3064 xMax1 = line->xMax;
3065 yMin1 = line->edge[j + len];
3066 yMax1 = line->edge[j];
3067 break;
3069 if (backward) {
3070 if ((startAtTop ||
3071 yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
3072 (stopAtBottom ||
3073 yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3074 if (!found ||
3075 yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3076 xMin0 = xMin1;
3077 xMax0 = xMax1;
3078 yMin0 = yMin1;
3079 yMax0 = yMax1;
3080 found = gTrue;
3083 } else {
3084 if ((startAtTop ||
3085 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
3086 (stopAtBottom ||
3087 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
3088 if (!found ||
3089 yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
3090 xMin0 = xMin1;
3091 xMax0 = xMax1;
3092 yMin0 = yMin1;
3093 yMax0 = yMax1;
3094 found = gTrue;
3099 if (backward) {
3100 --j;
3101 --p;
3102 } else {
3103 ++j;
3104 ++p;
3110 if (!caseSensitive) {
3111 gfree(s2);
3112 gfree(txt);
3115 if (found) {
3116 *xMin = xMin0;
3117 *xMax = xMax0;
3118 *yMin = yMin0;
3119 *yMax = yMax0;
3120 lastFindXMin = xMin0;
3121 lastFindYMin = yMin0;
3122 haveLastFind = gTrue;
3123 return gTrue;
3126 return gFalse;
3129 GString *TextPage::getText(double xMin, double yMin,
3130 double xMax, double yMax) {
3131 GString *s;
3132 UnicodeMap *uMap;
3133 GBool isUnicode __unused;
3134 TextBlock *blk;
3135 TextLine *line;
3136 TextLineFrag *frags;
3137 int nFrags, fragsSize;
3138 TextLineFrag *frag;
3139 char space[8], eol[16];
3140 int spaceLen, eolLen;
3141 int lastRot;
3142 double x, y, delta;
3143 int col, idx0, idx1, i, j;
3144 GBool multiLine, oneRot;
3146 s = new GString();
3148 if (rawOrder) {
3149 return s;
3152 // get the output encoding
3153 if (!(uMap = globalParams->getTextEncoding())) {
3154 return s;
3156 isUnicode = uMap->isUnicode();
3157 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3158 eolLen = 0; // make gcc happy
3159 switch (globalParams->getTextEOL()) {
3160 case eolUnix:
3161 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3162 break;
3163 case eolDOS:
3164 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3165 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3166 break;
3167 case eolMac:
3168 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3169 break;
3172 //~ writing mode (horiz/vert)
3174 // collect the line fragments that are in the rectangle
3175 fragsSize = 256;
3176 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3177 nFrags = 0;
3178 lastRot = -1;
3179 oneRot = gTrue;
3180 for (i = 0; i < nBlocks; ++i) {
3181 blk = blocks[i];
3182 if (xMin < blk->xMax && blk->xMin < xMax &&
3183 yMin < blk->yMax && blk->yMin < yMax) {
3184 for (line = blk->lines; line; line = line->next) {
3185 if (xMin < line->xMax && line->xMin < xMax &&
3186 yMin < line->yMax && line->yMin < yMax) {
3187 idx0 = idx1 = -1;
3188 switch (line->rot) {
3189 case 0:
3190 y = 0.5 * (line->yMin + line->yMax);
3191 if (yMin < y && y < yMax) {
3192 j = 0;
3193 while (j < line->len) {
3194 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3195 idx0 = j;
3196 break;
3198 ++j;
3200 j = line->len - 1;
3201 while (j >= 0) {
3202 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3203 idx1 = j;
3204 break;
3206 --j;
3209 break;
3210 case 1:
3211 x = 0.5 * (line->xMin + line->xMax);
3212 if (xMin < x && x < xMax) {
3213 j = 0;
3214 while (j < line->len) {
3215 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3216 idx0 = j;
3217 break;
3219 ++j;
3221 j = line->len - 1;
3222 while (j >= 0) {
3223 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3224 idx1 = j;
3225 break;
3227 --j;
3230 break;
3231 case 2:
3232 y = 0.5 * (line->yMin + line->yMax);
3233 if (yMin < y && y < yMax) {
3234 j = 0;
3235 while (j < line->len) {
3236 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3237 idx0 = j;
3238 break;
3240 ++j;
3242 j = line->len - 1;
3243 while (j >= 0) {
3244 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3245 idx1 = j;
3246 break;
3248 --j;
3251 break;
3252 case 3:
3253 x = 0.5 * (line->xMin + line->xMax);
3254 if (xMin < x && x < xMax) {
3255 j = 0;
3256 while (j < line->len) {
3257 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3258 idx0 = j;
3259 break;
3261 ++j;
3263 j = line->len - 1;
3264 while (j >= 0) {
3265 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3266 idx1 = j;
3267 break;
3269 --j;
3272 break;
3274 if (idx0 >= 0 && idx1 >= 0) {
3275 if (nFrags == fragsSize) {
3276 fragsSize *= 2;
3277 frags = (TextLineFrag *)
3278 greallocn(frags, fragsSize, sizeof(TextLineFrag));
3280 frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
3281 ++nFrags;
3282 if (lastRot >= 0 && line->rot != lastRot) {
3283 oneRot = gFalse;
3285 lastRot = line->rot;
3292 // sort the fragments and generate the string
3293 if (nFrags > 0) {
3295 for (i = 0; i < nFrags; ++i) {
3296 frags[i].computeCoords(oneRot);
3298 assignColumns(frags, nFrags, oneRot);
3300 // if all lines in the region have the same rotation, use it;
3301 // otherwise, use the page's primary rotation
3302 if (oneRot) {
3303 qsort(frags, nFrags, sizeof(TextLineFrag),
3304 &TextLineFrag::cmpYXLineRot);
3305 } else {
3306 qsort(frags, nFrags, sizeof(TextLineFrag),
3307 &TextLineFrag::cmpYXPrimaryRot);
3309 i = 0;
3310 while (i < nFrags) {
3311 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
3312 for (j = i+1;
3313 j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
3314 ++j) ;
3315 qsort(frags + i, j - i, sizeof(TextLineFrag),
3316 oneRot ? &TextLineFrag::cmpXYColumnLineRot
3317 : &TextLineFrag::cmpXYColumnPrimaryRot);
3318 i = j;
3321 col = 0;
3322 multiLine = gFalse;
3323 for (i = 0; i < nFrags; ++i) {
3324 frag = &frags[i];
3326 // insert a return
3327 if (frag->col < col ||
3328 (i > 0 && fabs(frag->base - frags[i-1].base) >
3329 maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
3330 s->append(eol, eolLen);
3331 col = 0;
3332 multiLine = gTrue;
3335 // column alignment
3336 for (; col < frag->col; ++col) {
3337 s->append(space, spaceLen);
3340 // get the fragment text
3341 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3344 if (multiLine) {
3345 s->append(eol, eolLen);
3349 gfree(frags);
3350 uMap->decRefCnt();
3352 return s;
3355 GBool TextPage::findCharRange(int pos, int length,
3356 double *xMin, double *yMin,
3357 double *xMax, double *yMax) {
3358 TextBlock *blk;
3359 TextLine *line;
3360 TextWord *word;
3361 double xMin0, xMax0, yMin0, yMax0;
3362 double xMin1, xMax1, yMin1, yMax1;
3363 GBool first;
3364 int i, j0, j1;
3366 if (rawOrder) {
3367 return gFalse;
3370 //~ this doesn't correctly handle:
3371 //~ - ranges split across multiple lines (the highlighted region
3372 //~ is the bounding box of all the parts of the range)
3373 //~ - cases where characters don't convert one-to-one into Unicode
3374 first = gTrue;
3375 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3376 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3377 for (i = 0; i < nBlocks; ++i) {
3378 blk = blocks[i];
3379 for (line = blk->lines; line; line = line->next) {
3380 for (word = line->words; word; word = word->next) {
3381 if (pos < word->charPos + word->charLen &&
3382 word->charPos < pos + length) {
3383 j0 = pos - word->charPos;
3384 if (j0 < 0) {
3385 j0 = 0;
3387 j1 = pos + length - 1 - word->charPos;
3388 if (j1 >= word->len) {
3389 j1 = word->len - 1;
3391 switch (line->rot) {
3392 case 0:
3393 xMin1 = word->edge[j0];
3394 xMax1 = word->edge[j1 + 1];
3395 yMin1 = word->yMin;
3396 yMax1 = word->yMax;
3397 break;
3398 case 1:
3399 xMin1 = word->xMin;
3400 xMax1 = word->xMax;
3401 yMin1 = word->edge[j0];
3402 yMax1 = word->edge[j1 + 1];
3403 break;
3404 case 2:
3405 xMin1 = word->edge[j1 + 1];
3406 xMax1 = word->edge[j0];
3407 yMin1 = word->yMin;
3408 yMax1 = word->yMax;
3409 break;
3410 case 3:
3411 xMin1 = word->xMin;
3412 xMax1 = word->xMax;
3413 yMin1 = word->edge[j1 + 1];
3414 yMax1 = word->edge[j0];
3415 break;
3417 if (first || xMin1 < xMin0) {
3418 xMin0 = xMin1;
3420 if (first || xMax1 > xMax0) {
3421 xMax0 = xMax1;
3423 if (first || yMin1 < yMin0) {
3424 yMin0 = yMin1;
3426 if (first || yMax1 > yMax0) {
3427 yMax0 = yMax1;
3429 first = gFalse;
3434 if (!first) {
3435 *xMin = xMin0;
3436 *xMax = xMax0;
3437 *yMin = yMin0;
3438 *yMax = yMax0;
3439 return gTrue;
3441 return gFalse;
3444 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
3445 GBool physLayout) {
3446 UnicodeMap *uMap;
3447 TextFlow *flow;
3448 TextBlock *blk;
3449 TextLine *line;
3450 TextLineFrag *frags;
3451 TextWord *word;
3452 int nFrags, fragsSize;
3453 TextLineFrag *frag;
3454 char space[8], eol[16], eop[8];
3455 int spaceLen, eolLen, eopLen;
3456 GBool pageBreaks;
3457 GString *s;
3458 double delta;
3459 int col, i, j, d, n;
3461 // get the output encoding
3462 if (!(uMap = globalParams->getTextEncoding())) {
3463 return;
3465 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3466 eolLen = 0; // make gcc happy
3467 switch (globalParams->getTextEOL()) {
3468 case eolUnix:
3469 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3470 break;
3471 case eolDOS:
3472 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3473 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3474 break;
3475 case eolMac:
3476 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3477 break;
3479 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
3480 pageBreaks = globalParams->getTextPageBreaks();
3482 //~ writing mode (horiz/vert)
3484 // output the page in raw (content stream) order
3485 if (rawOrder) {
3487 for (word = rawWords; word; word = word->next) {
3488 s = new GString();
3489 dumpFragment(word->text, word->len, uMap, s);
3490 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3491 delete s;
3492 if (word->next &&
3493 fabs(word->next->base - word->base) <
3494 maxIntraLineDelta * word->fontSize) {
3495 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
3496 (*outputFunc)(outputStream, space, spaceLen);
3498 } else {
3499 (*outputFunc)(outputStream, eol, eolLen);
3503 // output the page, maintaining the original physical layout
3504 } else if (physLayout) {
3506 // collect the line fragments for the page and sort them
3507 fragsSize = 256;
3508 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3509 nFrags = 0;
3510 for (i = 0; i < nBlocks; ++i) {
3511 blk = blocks[i];
3512 for (line = blk->lines; line; line = line->next) {
3513 if (nFrags == fragsSize) {
3514 fragsSize *= 2;
3515 frags = (TextLineFrag *)greallocn(frags,
3516 fragsSize, sizeof(TextLineFrag));
3518 frags[nFrags].init(line, 0, line->len);
3519 frags[nFrags].computeCoords(gTrue);
3520 ++nFrags;
3523 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
3524 i = 0;
3525 while (i < nFrags) {
3526 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
3527 for (j = i+1;
3528 j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
3529 ++j) ;
3530 qsort(frags + i, j - i, sizeof(TextLineFrag),
3531 &TextLineFrag::cmpXYColumnPrimaryRot);
3532 i = j;
3535 #if 0 // for debugging
3536 printf("*** line fragments ***\n");
3537 for (i = 0; i < nFrags; ++i) {
3538 frag = &frags[i];
3539 printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
3540 frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
3541 for (n = 0; n < frag->len; ++n) {
3542 fputc(frag->line->text[frag->start + n] & 0xff, stdout);
3544 printf("'\n");
3546 printf("\n");
3547 #endif
3549 // generate output
3550 col = 0;
3551 for (i = 0; i < nFrags; ++i) {
3552 frag = &frags[i];
3554 // column alignment
3555 for (; col < frag->col; ++col) {
3556 (*outputFunc)(outputStream, space, spaceLen);
3559 // print the line
3560 s = new GString();
3561 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3562 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3563 delete s;
3565 // print one or more returns if necessary
3566 if (i == nFrags - 1 ||
3567 frags[i+1].col < col ||
3568 fabs(frags[i+1].base - frag->base) >
3569 maxIntraLineDelta * frag->line->words->fontSize) {
3570 if (i < nFrags - 1) {
3571 d = (int)((frags[i+1].base - frag->base) /
3572 frag->line->words->fontSize);
3573 if (d < 1) {
3574 d = 1;
3575 } else if (d > 5) {
3576 d = 5;
3578 } else {
3579 d = 1;
3581 for (; d > 0; --d) {
3582 (*outputFunc)(outputStream, eol, eolLen);
3584 col = 0;
3588 gfree(frags);
3590 // output the page, "undoing" the layout
3591 } else {
3592 for (flow = flows; flow; flow = flow->next) {
3593 for (blk = flow->blocks; blk; blk = blk->next) {
3594 for (line = blk->lines; line; line = line->next) {
3595 n = line->len;
3596 if (line->hyphenated && (line->next || blk->next)) {
3597 --n;
3599 s = new GString();
3600 dumpFragment(line->text, n, uMap, s);
3601 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3602 delete s;
3603 if (!line->hyphenated) {
3604 if (line->next) {
3605 (*outputFunc)(outputStream, space, spaceLen);
3606 } else if (blk->next) {
3607 //~ this is a bit of a kludge - we should really do a more
3608 //~ intelligent determination of paragraphs
3609 if (blk->next->lines->words->fontSize ==
3610 blk->lines->words->fontSize) {
3611 (*outputFunc)(outputStream, space, spaceLen);
3612 } else {
3613 (*outputFunc)(outputStream, eol, eolLen);
3619 (*outputFunc)(outputStream, eol, eolLen);
3620 (*outputFunc)(outputStream, eol, eolLen);
3624 // end of page
3625 if (pageBreaks) {
3626 (*outputFunc)(outputStream, eop, eopLen);
3629 uMap->decRefCnt();
3632 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
3633 TextLineFrag *frag0, *frag1;
3634 int rot, col1, col2, i, j, k;
3636 // all text in the region has the same rotation -- recompute the
3637 // column numbers based only on the text in the region
3638 if (oneRot) {
3639 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
3640 rot = frags[0].line->rot;
3641 for (i = 0; i < nFrags; ++i) {
3642 frag0 = &frags[i];
3643 col1 = 0;
3644 for (j = 0; j < i; ++j) {
3645 frag1 = &frags[j];
3646 col2 = 0; // make gcc happy
3647 switch (rot) {
3648 case 0:
3649 if (frag0->xMin >= frag1->xMax) {
3650 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3651 frag1->line->col[frag1->start]) + 1;
3652 } else {
3653 for (k = frag1->start;
3654 k < frag1->start + frag1->len &&
3655 frag0->xMin >= 0.5 * (frag1->line->edge[k] +
3656 frag1->line->edge[k+1]);
3657 ++k) ;
3658 col2 = frag1->col +
3659 frag1->line->col[k] - frag1->line->col[frag1->start];
3661 break;
3662 case 1:
3663 if (frag0->yMin >= frag1->yMax) {
3664 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3665 frag1->line->col[frag1->start]) + 1;
3666 } else {
3667 for (k = frag1->start;
3668 k < frag1->start + frag1->len &&
3669 frag0->yMin >= 0.5 * (frag1->line->edge[k] +
3670 frag1->line->edge[k+1]);
3671 ++k) ;
3672 col2 = frag1->col +
3673 frag1->line->col[k] - frag1->line->col[frag1->start];
3675 break;
3676 case 2:
3677 if (frag0->xMax <= frag1->xMin) {
3678 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3679 frag1->line->col[frag1->start]) + 1;
3680 } else {
3681 for (k = frag1->start;
3682 k < frag1->start + frag1->len &&
3683 frag0->xMax <= 0.5 * (frag1->line->edge[k] +
3684 frag1->line->edge[k+1]);
3685 ++k) ;
3686 col2 = frag1->col +
3687 frag1->line->col[k] - frag1->line->col[frag1->start];
3689 break;
3690 case 3:
3691 if (frag0->yMax <= frag1->yMin) {
3692 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3693 frag1->line->col[frag1->start]) + 1;
3694 } else {
3695 for (k = frag1->start;
3696 k < frag1->start + frag1->len &&
3697 frag0->yMax <= 0.5 * (frag1->line->edge[k] +
3698 frag1->line->edge[k+1]);
3699 ++k) ;
3700 col2 = frag1->col +
3701 frag1->line->col[k] - frag1->line->col[frag1->start];
3703 break;
3705 if (col2 > col1) {
3706 col1 = col2;
3709 frag0->col = col1;
3712 // the region includes text at different rotations -- use the
3713 // globally assigned column numbers, offset by the minimum column
3714 // number (i.e., shift everything over to column 0)
3715 } else {
3716 col1 = frags[0].col;
3717 for (i = 1; i < nFrags; ++i) {
3718 if (frags[i].col < col1) {
3719 col1 = frags[i].col;
3722 for (i = 0; i < nFrags; ++i) {
3723 frags[i].col -= col1;
3728 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
3729 GString *s) {
3730 char lre[8], rle[8], popdf[8], buf[8];
3731 int lreLen, rleLen, popdfLen, n;
3732 int nCols, i, j, k;
3734 nCols = 0;
3736 if (uMap->isUnicode()) {
3738 lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
3739 rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
3740 popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
3742 if (primaryLR) {
3744 i = 0;
3745 while (i < len) {
3746 // output a left-to-right section
3747 for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
3748 for (k = i; k < j; ++k) {
3749 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3750 s->append(buf, n);
3751 ++nCols;
3753 i = j;
3754 // output a right-to-left section
3755 for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
3756 if (j > i) {
3757 s->append(rle, rleLen);
3758 for (k = j - 1; k >= i; --k) {
3759 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3760 s->append(buf, n);
3761 ++nCols;
3763 s->append(popdf, popdfLen);
3764 i = j;
3768 } else {
3770 s->append(rle, rleLen);
3771 i = len - 1;
3772 while (i >= 0) {
3773 // output a right-to-left section
3774 for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
3775 for (k = i; k > j; --k) {
3776 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3777 s->append(buf, n);
3778 ++nCols;
3780 i = j;
3781 // output a left-to-right section
3782 for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
3783 if (j < i) {
3784 s->append(lre, lreLen);
3785 for (k = j + 1; k <= i; ++k) {
3786 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3787 s->append(buf, n);
3788 ++nCols;
3790 s->append(popdf, popdfLen);
3791 i = j;
3794 s->append(popdf, popdfLen);
3798 } else {
3799 for (i = 0; i < len; ++i) {
3800 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
3801 s->append(buf, n);
3802 nCols += n;
3806 return nCols;
#if TEXTOUT_WORD_LIST
// Allocate and return a new TextWordList built from this page, passing
// <physLayout> through to the TextWordList constructor.
TextWordList *TextPage::makeWordList(GBool physLayout) {
  TextWordList *words = new TextWordList(this, physLayout);
  return words;
}
#endif
3815 //------------------------------------------------------------------------
3816 // TextOutputDev
3817 //------------------------------------------------------------------------
// Output callback used when TextOutputDev writes to a stdio stream:
// writes <len> bytes from <text> to <stream>, which must be a FILE *.
// Calls with a NULL stream, a NULL buffer, or a non-positive length
// are ignored.
static void outputToFile(void *stream, char *text, int len) {
  // fwrite() takes a size_t count, so a negative <len> would be
  // converted to a huge unsigned value and read far past the buffer
  if (!stream || !text || len <= 0) {
    return;
  }
  fwrite(text, 1, (size_t)len, (FILE *)stream);
}
3823 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
3824 GBool rawOrderA, GBool append) {
3825 text = NULL;
3826 physLayout = physLayoutA;
3827 rawOrder = rawOrderA;
3828 doHTML = gFalse;
3829 ok = gTrue;
3831 // open file
3832 needClose = gFalse;
3833 if (fileName) {
3834 if (!strcmp(fileName, "-")) {
3835 outputStream = stdout;
3836 #ifdef WIN32
3837 // keep DOS from munging the end-of-line characters
3838 setmode(fileno(stdout), O_BINARY);
3839 #endif
3840 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
3841 needClose = gTrue;
3842 } else {
3843 error(-1, "Couldn't open text file '%s'", fileName);
3844 ok = gFalse;
3845 return;
3847 outputFunc = &outputToFile;
3848 } else {
3849 outputStream = NULL;
3852 // set up text xObject
3853 text = new TextPage(rawOrderA);
3856 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
3857 GBool physLayoutA, GBool rawOrderA) {
3858 outputFunc = func;
3859 outputStream = stream;
3860 needClose = gFalse;
3861 physLayout = physLayoutA;
3862 rawOrder = rawOrderA;
3863 doHTML = gFalse;
3864 text = new TextPage(rawOrderA);
3865 ok = gTrue;
3868 TextOutputDev::~TextOutputDev() {
3869 if (needClose) {
3870 #ifdef MACOS
3871 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
3872 #endif
3873 fclose((FILE *)outputStream);
3875 if (text) {
3876 delete text;
3880 void TextOutputDev::startPage(int pageNum, GfxState *state) {
3881 text->startPage(state);
3884 void TextOutputDev::endPage() {
3885 text->endPage();
3886 text->coalesce(physLayout, doHTML);
3887 if (outputStream) {
3888 text->dump(outputStream, outputFunc, physLayout);
3892 void TextOutputDev::updateFont(GfxState *state) {
3893 text->updateFont(state);
3896 void TextOutputDev::beginString(GfxState *state, GString *s) {
3899 void TextOutputDev::endString(GfxState *state) {
3902 void TextOutputDev::drawChar(GfxState *state, double x, double y,
3903 double dx, double dy,
3904 double originX, double originY,
3905 CharCode c, int nBytes, Unicode *u, int uLen) {
3906 text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
3909 void TextOutputDev::stroke(GfxState *state) {
3910 GfxPath *path;
3911 GfxSubpath *subpath;
3912 double x[2], y[2];
3914 if (!doHTML) {
3915 return;
3917 path = state->getPath();
3918 if (path->getNumSubpaths() != 1) {
3919 return;
3921 subpath = path->getSubpath(0);
3922 if (subpath->getNumPoints() != 2) {
3923 return;
3925 state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
3926 state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
3928 // look for a vertical or horizontal line
3929 if (x[0] == x[1] || y[0] == y[1]) {
3930 text->addUnderline(x[0], y[0], x[1], y[1]);
3934 void TextOutputDev::fill(GfxState *state) {
3935 GfxPath *path;
3936 GfxSubpath *subpath;
3937 double x[5], y[5];
3938 double rx0, ry0, rx1, ry1, t;
3939 int i;
3941 if (!doHTML) {
3942 return;
3944 path = state->getPath();
3945 if (path->getNumSubpaths() != 1) {
3946 return;
3948 subpath = path->getSubpath(0);
3949 if (subpath->getNumPoints() != 5) {
3950 return;
3952 for (i = 0; i < 5; ++i) {
3953 if (subpath->getCurve(i)) {
3954 return;
3956 state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
3959 // look for a rectangle
3960 if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
3961 x[0] == x[4] && y[0] == y[4]) {
3962 rx0 = x[0];
3963 ry0 = y[0];
3964 rx1 = x[2];
3965 ry1 = y[1];
3966 } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
3967 x[0] == x[4] && y[0] == y[4]) {
3968 rx0 = x[0];
3969 ry0 = y[0];
3970 rx1 = x[1];
3971 ry1 = y[2];
3972 } else {
3973 return;
3975 if (rx1 < rx0) {
3976 t = rx0;
3977 rx0 = rx1;
3978 rx1 = t;
3980 if (ry1 < ry0) {
3981 t = ry0;
3982 ry0 = ry1;
3983 ry1 = t;
3986 // skinny horizontal rectangle
3987 if (ry1 - ry0 < rx1 - rx0) {
3988 if (ry1 - ry0 < maxUnderlineWidth) {
3989 ry0 = 0.5 * (ry0 + ry1);
3990 text->addUnderline(rx0, ry0, rx1, ry0);
3993 // skinny vertical rectangle
3994 } else {
3995 if (rx1 - rx0 < maxUnderlineWidth) {
3996 rx0 = 0.5 * (rx0 + rx1);
3997 text->addUnderline(rx0, ry0, rx0, ry1);
4002 void TextOutputDev::eoFill(GfxState *state) {
4003 if (!doHTML) {
4004 return;
4006 fill(state);
4009 void TextOutputDev::processLink(Link *link, Catalog *catalog) {
4010 double x1, y1, x2, y2;
4011 int xMin, yMin, xMax, yMax, x, y;
4013 if (!doHTML) {
4014 return;
4016 link->getRect(&x1, &y1, &x2, &y2);
4017 cvtUserToDev(x1, y1, &x, &y);
4018 xMin = xMax = x;
4019 yMin = yMax = y;
4020 cvtUserToDev(x1, y2, &x, &y);
4021 if (x < xMin) {
4022 xMin = x;
4023 } else if (x > xMax) {
4024 xMax = x;
4026 if (y < yMin) {
4027 yMin = y;
4028 } else if (y > yMax) {
4029 yMax = y;
4031 cvtUserToDev(x2, y1, &x, &y);
4032 if (x < xMin) {
4033 xMin = x;
4034 } else if (x > xMax) {
4035 xMax = x;
4037 if (y < yMin) {
4038 yMin = y;
4039 } else if (y > yMax) {
4040 yMax = y;
4042 cvtUserToDev(x2, y2, &x, &y);
4043 if (x < xMin) {
4044 xMin = x;
4045 } else if (x > xMax) {
4046 xMax = x;
4048 if (y < yMin) {
4049 yMin = y;
4050 } else if (y > yMax) {
4051 yMax = y;
4053 text->addLink(xMin, yMin, xMax, yMax, link);
4056 GBool TextOutputDev::findText(Unicode *s, int len,
4057 GBool startAtTop, GBool stopAtBottom,
4058 GBool startAtLast, GBool stopAtLast,
4059 GBool caseSensitive, GBool backward,
4060 double *xMin, double *yMin,
4061 double *xMax, double *yMax) {
4062 return text->findText(s, len, startAtTop, stopAtBottom,
4063 startAtLast, stopAtLast, caseSensitive, backward,
4064 xMin, yMin, xMax, yMax);
4067 GString *TextOutputDev::getText(double xMin, double yMin,
4068 double xMax, double yMax) {
4069 return text->getText(xMin, yMin, xMax, yMax);
4072 GBool TextOutputDev::findCharRange(int pos, int length,
4073 double *xMin, double *yMin,
4074 double *xMax, double *yMax) {
4075 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
#if TEXTOUT_WORD_LIST
// Ask the TextPage for a word list, using this device's physLayout
// setting.
TextWordList *TextOutputDev::makeWordList() {
  TextWordList *words = text->makeWordList(physLayout);
  return words;
}
#endif
4084 TextPage *TextOutputDev::takeText() {
4085 TextPage *ret;
4087 ret = text;
4088 text = new TextPage(rawOrder);
4089 return ret;