usr/src/cmd/man/src/util/nsgmls.src/lib/UnicodeCodingSystem.cxx

   1 // Copyright (c) 1994 James Clark
   2 // See the file COPYING for copying permission.
   3 #pragma ident   "%Z%%M% %I%     %E% SMI"
   4
   5 #include "splib.h"
   6
   7 #ifdef SP_MULTI_BYTE
   8
   9 #include "UnicodeCodingSystem.h"
  10 #include "macros.h"
  11 #include "Owner.h"
  12
  13 #include <stddef.h>
  14 #include <string.h>
  15 #ifdef DECLARE_MEMMOVE
  16 extern "C" {
  17   void *memmove(void *, const void *, size_t);
  18 }
  19 #endif
  20
  21 #ifdef SP_NAMESPACE
  22 namespace SP_NAMESPACE {
  23 #endif
  24
  25 const unsigned short byteOrderMark = 0xfeff;
  26 const unsigned short swappedByteOrderMark = 0xfffe;
  27
  28 class UnicodeDecoder : public Decoder {
  29 public:
  30   UnicodeDecoder(const InputCodingSystem *sub);
  31   size_t decode(Char *to, const char *from, size_t fromLen,
  32                 const char **rest);
  33   Boolean convertOffset(unsigned long &offset) const;
  34 private:
  35   PackedBoolean hadFirstChar_;
  36   PackedBoolean hadByteOrderMark_;
  37   PackedBoolean swapBytes_;
  38   Owner<Decoder> subDecoder_;
  39   const InputCodingSystem *subCodingSystem_;
  40 };
  41
  42 class UnicodeEncoder : public Encoder {
  43 public:
  44   UnicodeEncoder();
  45   ~UnicodeEncoder();
  46   void output(Char *, size_t, OutputByteStream *);
  47   void output(const Char *, size_t, OutputByteStream *);
  48   void startFile(OutputByteStream *);
  49 private:
  50   void allocBuf(size_t);
  51   unsigned short *buf_;
  52   size_t bufSize_;
  53 };
  54
  55 UnicodeCodingSystem::UnicodeCodingSystem(const InputCodingSystem *sub)
  56 : sub_(sub)
  57 {
  58 }
  59
  60 Decoder *UnicodeCodingSystem::makeDecoder() const
  61 {
  62   return new UnicodeDecoder(sub_);
  63 }
  64
  65 Encoder *UnicodeCodingSystem::makeEncoder() const
  66 {
  67   return new UnicodeEncoder;
  68 }
  69
  70 unsigned UnicodeCodingSystem::fixedBytesPerChar() const
  71 {
  72   return 2;
  73 }
  74
  75 UnicodeDecoder::UnicodeDecoder(const InputCodingSystem *subCodingSystem)
  76 : Decoder(subCodingSystem ? 1 : 2), subCodingSystem_(subCodingSystem),
  77   hadByteOrderMark_(0), hadFirstChar_(0), swapBytes_(0)
  78 {
  79 }
  80
  81
  82 size_t UnicodeDecoder::decode(Char *to, const char *from, size_t fromLen,
  83                               const char **rest)
  84 {
  85   union U {
  86     unsigned short word;
  87     char bytes[2];
  88   };
  89
  90   if (subDecoder_)
  91     return subDecoder_->decode(to, from, fromLen, rest);
  92   if (!hadFirstChar_) {
  93     if (fromLen < 2) {
  94       *rest = from;
  95       return 0;
  96     }
  97     hadFirstChar_ = 1;
  98     minBytesPerChar_ = 2;
  99     U u;
 100     u.bytes[0] = from[0];
 101     u.bytes[1] = from[1];
 102     if (u.word == byteOrderMark) {
 103       hadByteOrderMark_ = 1;
 104       from += 2;
 105       fromLen -= 2;
 106     }
 107     else if (u.word == swappedByteOrderMark) {
 108       hadByteOrderMark_ = 1;
 109       from += 2;
 110       fromLen -= 2;
 111       swapBytes_ = 1;
 112     }
 113     else if (subCodingSystem_) {
 114       subDecoder_ = subCodingSystem_->makeDecoder();
 115       minBytesPerChar_ = subDecoder_->minBytesPerChar();
 116       return subDecoder_->decode(to, from, fromLen, rest);
 117     }
 118   }
 119   fromLen &= ~1;
 120   *rest = from + fromLen;
 121   if (sizeof(Char) == 2) {
 122     if (!swapBytes_) {
 123       if (from != (char *)to)
 124         memmove(to, from, fromLen);
 125       return fromLen/2;
 126     }
 127   }
 128   if (swapBytes_) {
 129     for (size_t n = fromLen; n > 0; n -= 2) {
 130       U u;
 131       u.bytes[1] = *from++;
 132       u.bytes[0] = *from++;
 133       *to++ = u.word;
 134     }
 135   }
 136   else  {
 137     for (size_t n = fromLen; n > 0; n -= 2) {
 138       U u;
 139       u.bytes[0] = *from++;
 140       u.bytes[1] = *from++;
 141       *to++ = u.word;
 142     }
 143   }
 144   return fromLen/2;
 145 }
 146
 147 Boolean UnicodeDecoder::convertOffset(unsigned long &n) const
 148 {
 149   if (subDecoder_)
 150     return subDecoder_->convertOffset(n);
 151   if (hadByteOrderMark_)
 152     n += 1;
 153   n *= 2;
 154   return true;
 155 }
 156
 157 UnicodeEncoder::UnicodeEncoder()
 158 : buf_(0), bufSize_(0)
 159 {
 160 }
 161
 162 UnicodeEncoder::~UnicodeEncoder()
 163 {
 164   delete [] buf_;
 165 }
 166
 167 void UnicodeEncoder::allocBuf(size_t n)
 168 {
 169   if (bufSize_ < n) {
 170     delete [] buf_;
 171     buf_ = new unsigned short[bufSize_ = n];
 172   }
 173 }
 174
 175 void UnicodeEncoder::startFile(OutputByteStream *sb)
 176 {
 177   const unsigned short n = byteOrderMark;
 178   sb->sputn((char *)&n, 2);
 179 }
 180
 181 void UnicodeEncoder::output(Char *s, size_t n, OutputByteStream *sb)
 182 {
 183   if (sizeof(Char) == 2) {
 184     sb->sputn((char *)s, n*2);
 185     return;
 186   }
 187   ASSERT(sizeof(Char) >= 2);
 188   unsigned short *p = (unsigned short *)s;
 189   for (size_t i = 0; i < n; i++)
 190     p[i] = s[i] & 0xffff;
 191   sb->sputn((char *)s, n*2);
 192 }
 193
 194 void UnicodeEncoder::output(const Char *s, size_t n, OutputByteStream *sb)
 195 {
 196   if (sizeof(Char) == 2) {
 197     sb->sputn((char *)s, n*2);
 198     return;
 199   }
 200   allocBuf(n);
 201   for (size_t i = 0; i < n; i++)
 202     buf_[i] = s[i] & 0xffff;
 203   sb->sputn((char *)buf_, n*2);
 204 }
 205
 206 #ifdef SP_NAMESPACE
 207 }
 208 #endif
 209
 210 #else /* not SP_MULTI_BYTE */
 211
 212 #ifndef __GNUG__
 213 static char non_empty_translation_unit; // sigh
 214 #endif
 215
 216 #endif /* not SP_MULTI_BYTE */