Imported from antiword-0.37.tar.gz.
[antiword.git] / findtext.c
blob20724a5e7bc47c02172ed0012ecac1c46ed0fcc6
/*
 * findtext.c
 * Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
 *
 * Description:
 * Find the blocks that contain the text of MS Word files
 */
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include "antiword.h"
15 * bAddTextBlocks - Add the blocks to the text block list
17 * Returns TRUE when successful, FALSE if not
19 BOOL
20 bAddTextBlocks(ULONG ulCharPosFirst, ULONG ulTotalLength,
21 BOOL bUsesUnicode, USHORT usPropMod,
22 ULONG ulStartBlock, const ULONG *aulBBD, size_t tBBDLen)
24 text_block_type tTextBlock;
25 ULONG ulCharPos, ulOffset, ulIndex;
26 long lToGo;
28 fail(ulTotalLength > (ULONG)LONG_MAX / 2);
29 fail(ulStartBlock > MAX_BLOCKNUMBER && ulStartBlock != END_OF_CHAIN);
30 fail(aulBBD == NULL);
32 NO_DBG_HEX(ulCharPosFirst);
33 NO_DBG_DEC(ulTotalLength);
35 if (bUsesUnicode) {
36 /* One character equals two bytes */
37 NO_DBG_MSG("Uses Unicode");
38 lToGo = (long)ulTotalLength * 2;
39 } else {
40 /* One character equals one byte */
41 NO_DBG_MSG("Uses ASCII");
42 lToGo = (long)ulTotalLength;
45 ulCharPos = ulCharPosFirst;
46 ulOffset = ulCharPosFirst;
47 for (ulIndex = ulStartBlock;
48 ulIndex != END_OF_CHAIN && lToGo > 0;
49 ulIndex = aulBBD[ulIndex]) {
50 if (ulIndex >= (ULONG)tBBDLen) {
51 DBG_DEC(ulIndex);
52 DBG_DEC(tBBDLen);
53 werr(1, "The Big Block Depot is damaged");
55 if (ulOffset >= BIG_BLOCK_SIZE) {
56 ulOffset -= BIG_BLOCK_SIZE;
57 continue;
59 tTextBlock.ulFileOffset =
60 (ulIndex + 1) * BIG_BLOCK_SIZE + ulOffset;
61 tTextBlock.ulCharPos = ulCharPos;
62 tTextBlock.ulLength = min(BIG_BLOCK_SIZE - ulOffset,
63 (ULONG)lToGo);
64 tTextBlock.bUsesUnicode = bUsesUnicode;
65 tTextBlock.usPropMod = usPropMod;
66 ulOffset = 0;
67 if (!bAdd2TextBlockList(&tTextBlock)) {
68 DBG_HEX(tTextBlock.ulFileOffset);
69 DBG_HEX(tTextBlock.ulCharPos);
70 DBG_DEC(tTextBlock.ulLength);
71 DBG_DEC(tTextBlock.bUsesUnicode);
72 DBG_DEC(tTextBlock.usPropMod);
73 return FALSE;
75 ulCharPos += tTextBlock.ulLength;
76 lToGo -= (long)tTextBlock.ulLength;
78 DBG_DEC_C(lToGo != 0, lToGo);
79 return lToGo == 0;
80 } /* end of bAddTextBlocks */
83 * bGet6DocumentText - make a list of the text blocks of Word 6/7 files
85 * Code for "fast saved" files.
87 * Returns TRUE when successful, FALSE if not
89 BOOL
90 bGet6DocumentText(FILE *pFile, BOOL bUsesUnicode, ULONG ulStartBlock,
91 const ULONG *aulBBD, size_t tBBDLen, const UCHAR *aucHeader)
93 UCHAR *aucBuffer;
94 ULONG ulBeginTextInfo, ulTextOffset, ulTotLength;
95 size_t tTextInfoLen;
96 int iIndex, iType, iOff, iLen, iPieces;
97 USHORT usPropMod;
99 DBG_MSG("bGet6DocumentText");
101 fail(pFile == NULL);
102 fail(aulBBD == NULL);
103 fail(aucHeader == NULL);
105 ulBeginTextInfo = ulGetLong(0x160, aucHeader); /* fcClx */
106 DBG_HEX(ulBeginTextInfo);
107 tTextInfoLen = (size_t)ulGetLong(0x164, aucHeader); /* lcbClx */
108 DBG_DEC(tTextInfoLen);
110 aucBuffer = xmalloc(tTextInfoLen);
111 if (!bReadBuffer(pFile, ulStartBlock,
112 aulBBD, tBBDLen, BIG_BLOCK_SIZE,
113 aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
114 aucBuffer = xfree(aucBuffer);
115 return FALSE;
117 NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
119 iOff = 0;
120 while ((size_t)iOff < tTextInfoLen) {
121 iType = (int)ucGetByte(iOff, aucBuffer);
122 iOff++;
123 if (iType == 0) {
124 DBG_FIXME();
125 iOff++;
126 continue;
128 if (iType == 1) {
129 iLen = (int)usGetWord(iOff, aucBuffer);
130 vAdd2PropModList(aucBuffer + iOff);
131 iOff += iLen + 2;
132 continue;
134 if (iType != 2) {
135 werr(0, "Unknown type of 'fastsaved' format");
136 aucBuffer = xfree(aucBuffer);
137 return FALSE;
139 /* Type 2 */
140 iLen = (int)usGetWord(iOff, aucBuffer);
141 NO_DBG_DEC(iLen);
142 iOff += 4;
143 iPieces = (iLen - 4) / 12;
144 DBG_DEC(iPieces);
145 for (iIndex = 0; iIndex < iPieces; iIndex++) {
146 ulTextOffset = ulGetLong(
147 iOff + (iPieces + 1) * 4 + iIndex * 8 + 2,
148 aucBuffer);
149 usPropMod = usGetWord(
150 iOff + (iPieces + 1) * 4 + iIndex * 8 + 6,
151 aucBuffer);
152 ulTotLength = ulGetLong(iOff + (iIndex + 1) * 4,
153 aucBuffer) -
154 ulGetLong(iOff + iIndex * 4,
155 aucBuffer);
156 NO_DBG_HEX_C(usPropMod != 0, usPropMod);
157 if (!bAddTextBlocks(ulTextOffset, ulTotLength,
158 bUsesUnicode, usPropMod,
159 ulStartBlock,
160 aulBBD, tBBDLen)) {
161 aucBuffer = xfree(aucBuffer);
162 return FALSE;
165 break;
167 aucBuffer = xfree(aucBuffer);
168 return TRUE;
169 } /* end of bGet6DocumentText */
172 * bGet8DocumentText - make a list of the text blocks of Word 8/97 files
174 * Returns TRUE when successful, FALSE if not
176 BOOL
177 bGet8DocumentText(FILE *pFile, const pps_info_type *pPPS,
178 const ULONG *aulBBD, size_t tBBDLen,
179 const ULONG *aulSBD, size_t tSBDLen,
180 const UCHAR *aucHeader)
182 const ULONG *aulBlockDepot;
183 UCHAR *aucBuffer;
184 ULONG ulTextOffset, ulBeginTextInfo;
185 ULONG ulTotLength, ulLen;
186 long lIndex, lPieces, lOff;
187 size_t tTextInfoLen, tBlockDepotLen, tBlockSize;
188 int iType, iLen;
189 BOOL bUsesUnicode;
190 USHORT usPropMod;
192 DBG_MSG("bGet8DocumentText");
194 fail(pFile == NULL || pPPS == NULL);
195 fail(aulBBD == NULL || aulSBD == NULL);
196 fail(aucHeader == NULL);
198 ulBeginTextInfo = ulGetLong(0x1a2, aucHeader); /* fcClx */
199 DBG_HEX(ulBeginTextInfo);
200 tTextInfoLen = (size_t)ulGetLong(0x1a6, aucHeader); /* lcbClx */
201 DBG_DEC(tTextInfoLen);
203 DBG_DEC(pPPS->tTable.ulSB);
204 DBG_HEX(pPPS->tTable.ulSize);
205 if (pPPS->tTable.ulSize == 0) {
206 return FALSE;
209 if (pPPS->tTable.ulSize < MIN_SIZE_FOR_BBD_USE) {
210 /* Use the Small Block Depot */
211 aulBlockDepot = aulSBD;
212 tBlockDepotLen = tSBDLen;
213 tBlockSize = SMALL_BLOCK_SIZE;
214 } else {
215 /* Use the Big Block Depot */
216 aulBlockDepot = aulBBD;
217 tBlockDepotLen = tBBDLen;
218 tBlockSize = BIG_BLOCK_SIZE;
220 aucBuffer = xmalloc(tTextInfoLen);
221 if (!bReadBuffer(pFile, pPPS->tTable.ulSB,
222 aulBlockDepot, tBlockDepotLen, tBlockSize,
223 aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
224 aucBuffer = xfree(aucBuffer);
225 return FALSE;
227 NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
229 lOff = 0;
230 while (lOff < (long)tTextInfoLen) {
231 iType = (int)ucGetByte(lOff, aucBuffer);
232 lOff++;
233 if (iType == 0) {
234 DBG_FIXME();
235 lOff++;
236 continue;
238 if (iType == 1) {
239 iLen = (int)usGetWord(lOff, aucBuffer);
240 vAdd2PropModList(aucBuffer + lOff);
241 lOff += (long)iLen + 2;
242 continue;
244 if (iType != 2) {
245 werr(0, "Unknown type of 'fastsaved' format");
246 aucBuffer = xfree(aucBuffer);
247 return FALSE;
249 /* Type 2 */
250 ulLen = ulGetLong(lOff, aucBuffer);
251 if (ulLen < 4) {
252 DBG_DEC(ulLen);
253 return FALSE;
255 lOff += 4;
256 lPieces = (long)((ulLen - 4) / 12);
257 DBG_DEC(lPieces);
258 for (lIndex = 0; lIndex < lPieces; lIndex++) {
259 ulTextOffset = ulGetLong(
260 lOff + (lPieces + 1) * 4 + lIndex * 8 + 2,
261 aucBuffer);
262 usPropMod = usGetWord(
263 lOff + (lPieces + 1) * 4 + lIndex * 8 + 6,
264 aucBuffer);
265 ulTotLength = ulGetLong(lOff + (lIndex + 1) * 4,
266 aucBuffer) -
267 ulGetLong(lOff + lIndex * 4,
268 aucBuffer);
269 if ((ulTextOffset & BIT(30)) == 0) {
270 bUsesUnicode = TRUE;
271 } else {
272 bUsesUnicode = FALSE;
273 ulTextOffset &= ~BIT(30);
274 ulTextOffset /= 2;
276 NO_DBG_HEX_C(usPropMod != 0, usPropMod);
277 if (!bAddTextBlocks(ulTextOffset, ulTotLength,
278 bUsesUnicode, usPropMod,
279 pPPS->tWordDocument.ulSB,
280 aulBBD, tBBDLen)) {
281 aucBuffer = xfree(aucBuffer);
282 return FALSE;
285 break;
287 aucBuffer = xfree(aucBuffer);
288 return TRUE;
289 } /* end of bGet8DocumentText */