source/libs/poppler/poppler-0.32.0/poppler/UTF.cc

   1 //========================================================================
   2 //
    3 // UTF.cc
   4 //
   5 // Copyright 2001-2003 Glyph & Cog, LLC
   6 //
   7 //========================================================================
   8
   9 //========================================================================
  10 //
  11 // Modified under the Poppler project - http://poppler.freedesktop.org
  12 //
  13 // All changes made under the Poppler project to this file are licensed
  14 // under GPL version 2 or later
  15 //
  16 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
  17 // Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
  18 // Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
  19 //
  20 // To see a description of the changes please see the Changelog file that
  21 // came with your tarball or type make ChangeLog if you are building from git
  22 //
  23 //========================================================================
  24
  25 #include "goo/gmem.h"
  26 #include "PDFDocEncoding.h"
  27 #include "UTF.h"
  28
  29 bool UnicodeIsValid(Unicode ucs4)
  30 {
  31   return (ucs4 < 0x110000) &&
  32     ((ucs4 & 0xfffff800) != 0xd800) &&
  33     (ucs4 < 0xfdd0 || ucs4 > 0xfdef) &&
  34     ((ucs4 & 0xfffe) != 0xfffe);
  35 }
  36
  37 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
  38 {
  39   int i, n, len;
  40   Unicode *u;
  41
  42   // count characters
  43   len = 0;
  44   for (i = 0; i < utf16Len; i++) {
  45     if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
  46         utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
  47       i++; /* surrogate pair */
  48     }
  49     len++;
  50   }
  51   if (ucs4 == NULL)
  52     return len;
  53
  54   u = (Unicode*)gmallocn(len, sizeof(Unicode));
  55   n = 0;
  56   // convert string
  57   for (i = 0; i < utf16Len; i++) {
  58     if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
  59       if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
  60         /* next code is a low surrogate */
  61         u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
  62         ++i;
  63       } else {
  64         /* missing low surrogate
  65            replace it with REPLACEMENT CHARACTER (U+FFFD) */
  66         u[n] = 0xfffd;
  67       }
  68     } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
  69       /* invalid low surrogate
  70          replace it with REPLACEMENT CHARACTER (U+FFFD) */
  71       u[n] = 0xfffd;
  72     } else {
  73       u[n] = utf16[i];
  74     }
  75     if (!UnicodeIsValid(u[n])) {
  76       u[n] = 0xfffd;
  77     }
  78     n++;
  79   }
  80   *ucs4 = u;
  81   return len;
  82 }
  83
  84 int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
  85 {
  86   int i, len;
  87   const char *s;
  88   Unicode *u;
  89
  90   len = textStr->getLength();
  91   s = textStr->getCString();
  92   if (len == 0)
  93     return 0;
  94
  95   if (textStr->hasUnicodeMarker()) {
  96     Unicode *utf16;
  97     len = len/2 - 1;
  98     if (len > 0) {
  99       utf16 = new Unicode[len];
 100       for (i = 0 ; i < len; i++) {
 101         utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
 102       }
 103       len = UTF16toUCS4(utf16, len, &u);
 104       delete[] utf16;
 105     } else {
 106       u = NULL;
 107     }
 108   } else {
 109     u = (Unicode*)gmallocn(len, sizeof(Unicode));
 110     for (i = 0 ; i < len; i++) {
 111       u[i] = pdfDocEncoding[s[i] & 0xff];
 112     }
 113   }
 114   *ucs4 = u;
 115   return len;
 116 }