inc/check_UTF8.php

   1 <?php
   2 /* ***** BEGIN LICENSE BLOCK *****
   3  * Version: NPL 1.1/GPL 2.0/LGPL 2.1
   4  *
   5  * The contents of this file are subject to the Netscape Public License
   6  * Version 1.1 (the "License"); you may not use this file except in
   7  * compliance with the License. You may obtain a copy of the License at
   8  * http://www.mozilla.org/NPL/
   9  *
  10  * Software distributed under the License is distributed on an "AS IS" basis,
  11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12  * for the specific language governing rights and limitations under the
  13  * License.
  14  *
  15  * The Original Code is Mozilla Communicator client code.
  16  *
  17  * The Initial Developer of the Original Code is
  18  * Netscape Communications Corporation.
  19  * Portions created by the Initial Developer are Copyright (C) 1998
  20  * the Initial Developer. All Rights Reserved.
  21  *
  22  * Contributor(s):
  23  * Henri Sivonen, hsivonen@iki.fi
  24  *
  25  *
  26  * Alternatively, the contents of this file may be used under the terms of
  27  * either the GNU General Public License Version 2 or later (the "GPL"), or
  28  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29  * in which case the provisions of the GPL or the LGPL are applicable instead
  30  * of those above. If you wish to allow use of your version of this file only
  31  * under the terms of either the GPL or the LGPL, and not to allow others to
  32  * use your version of this file under the terms of the NPL, indicate your
  33  * decision by deleting the provisions above and replace them with the notice
  34  * and other provisions required by the GPL or the LGPL. If you do not delete
  35  * the provisions above, a recipient may use your version of this file under
  36  * the terms of any one of the NPL, the GPL or the LGPL.
  37  *
  38  * ***** END LICENSE BLOCK ***** */
  39
  40 /*
  41  * For the original C++ code, see
  42  * http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  43  * http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  44  *
  45  * The latest version of this file can be obtained from
  46  * http://iki.fi/hsivonen/php-utf8/
  47  *
  48  * Version 1.0, 2003-05-30
  49  */
  50
  51 /**
  52  * Takes an UTF-8 string and returns an array of ints representing the
  53  * Unicode characters. Astral planes are supported ie. the ints in the
  54  * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
  55  * are not allowed.
  56  *
  57  * Returns false if the input string isn't a valid UTF-8 octet sequence.
  58  */
  59 function utf8ToUnicode(&$str)
  60 {
  61   $mState = 0;     // cached expected number of octets after the current octet
  62                    // until the beginning of the next UTF8 character sequence
  63   $mUcs4  = 0;     // cached Unicode character
  64   $mBytes = 1;     // cached expected number of octets in the current sequence
  65
  66   $out = array();
  67
  68   $len = strlen($str);
  69   for($i = 0; $i < $len; $i++) {
  70     $in = ord($str{$i});
  71     if (0 == $mState) {
  72       // When mState is zero we expect either a US-ASCII character or a
  73       // multi-octet sequence.
  74       if (0 == (0x80 & ($in))) {
  75         // US-ASCII, pass straight through.
  76         $out[] = $in;
  77         $mBytes = 1;
  78       } else if (0xC0 == (0xE0 & ($in))) {
  79         // First octet of 2 octet sequence
  80         $mUcs4 = ($in);
  81         $mUcs4 = ($mUcs4 & 0x1F) << 6;
  82         $mState = 1;
  83         $mBytes = 2;
  84       } else if (0xE0 == (0xF0 & ($in))) {
  85         // First octet of 3 octet sequence
  86         $mUcs4 = ($in);
  87         $mUcs4 = ($mUcs4 & 0x0F) << 12;
  88         $mState = 2;
  89         $mBytes = 3;
  90       } else if (0xF0 == (0xF8 & ($in))) {
  91         // First octet of 4 octet sequence
  92         $mUcs4 = ($in);
  93         $mUcs4 = ($mUcs4 & 0x07) << 18;
  94         $mState = 3;
  95         $mBytes = 4;
  96       } else if (0xF8 == (0xFC & ($in))) {
  97         /* First octet of 5 octet sequence.
  98          *
  99          * This is illegal because the encoded codepoint must be either
 100          * (a) not the shortest form or
 101          * (b) outside the Unicode range of 0-0x10FFFF.
 102          * Rather than trying to resynchronize, we will carry on until the end
 103          * of the sequence and let the later error handling code catch it.
 104          */
 105         $mUcs4 = ($in);
 106         $mUcs4 = ($mUcs4 & 0x03) << 24;
 107         $mState = 4;
 108         $mBytes = 5;
 109       } else if (0xFC == (0xFE & ($in))) {
 110         // First octet of 6 octet sequence, see comments for 5 octet sequence.
 111         $mUcs4 = ($in);
 112         $mUcs4 = ($mUcs4 & 1) << 30;
 113         $mState = 5;
 114         $mBytes = 6;
 115       } else {
 116         /* Current octet is neither in the US-ASCII range nor a legal first
 117          * octet of a multi-octet sequence.
 118          */
 119         return false;
 120       }
 121     } else {
 122       // When mState is non-zero, we expect a continuation of the multi-octet
 123       // sequence
 124       if (0x80 == (0xC0 & ($in))) {
 125         // Legal continuation.
 126         $shift = ($mState - 1) * 6;
 127         $tmp = $in;
 128         $tmp = ($tmp & 0x0000003F) << $shift;
 129         $mUcs4 |= $tmp;
 130
 131         if (0 == --$mState) {
 132           /* End of the multi-octet sequence. mUcs4 now contains the final
 133            * Unicode codepoint to be output
 134            *
 135            * Check for illegal sequences and codepoints.
 136            */
 137
 138           // From Unicode 3.1, non-shortest form is illegal
 139           if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 140               ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 141               ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
 142               (4 < $mBytes) ||
 143               // From Unicode 3.2, surrogate characters are illegal
 144               (($mUcs4 & 0xFFFFF800) == 0xD800) ||
 145               // Codepoints outside the Unicode range are illegal
 146               ($mUcs4 > 0x10FFFF)) {
 147             return false;
 148           }
 149           if (0xFEFF != $mUcs4) {
 150             // BOM is legal but we don't want to output it
 151             $out[] = $mUcs4;
 152           }
 153           //initialize UTF8 cache
 154           $mState = 0;
 155           $mUcs4  = 0;
 156           $mBytes = 1;
 157         }
 158       } else {
 159         /* ((0xC0 & (*in) != 0x80) && (mState != 0))
 160          *
 161          * Incomplete multi-octet sequence.
 162          */
 163         return false;
 164       }
 165     }
 166   }
 167   return $out;
 168 }
 169
 170 /**
 171  * Takes an array of ints representing the Unicode characters and returns
 172  * a UTF-8 string. Astral planes are supported ie. the ints in the
 173  * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 174  * are not allowed.
 175  *
 176  * Returns false if the input array contains ints that represent
 177  * surrogates or are outside the Unicode range.
 178  */
 179 function unicodeToUtf8(&$arr)
 180 {
 181   $dest = '';
 182   foreach ($arr as $src) {
 183     if($src < 0) {
 184       return false;
 185     } else if ( $src <= 0x007f) {
 186       $dest .= chr($src);
 187     } else if ($src <= 0x07ff) {
 188       $dest .= chr(0xc0 | ($src >> 6));
 189       $dest .= chr(0x80 | ($src & 0x003f));
 190     } else if($src == 0xFEFF) {
 191       // nop -- zap the BOM
 192     } else if ($src >= 0xD800 && $src <= 0xDFFF) {
 193       // found a surrogate
 194       return false;
 195     } else if ($src <= 0xffff) {
 196       $dest .= chr(0xe0 | ($src >> 12));
 197       $dest .= chr(0x80 | (($src >> 6) & 0x003f));
 198       $dest .= chr(0x80 | ($src & 0x003f));
 199     } else if ($src <= 0x10ffff) {
 200       $dest .= chr(0xf0 | ($src >> 18));
 201       $dest .= chr(0x80 | (($src >> 12) & 0x3f));
 202       $dest .= chr(0x80 | (($src >> 6) & 0x3f));
 203       $dest .= chr(0x80 | ($src & 0x3f));
 204     } else {
 205       // out of range
 206       return false;
 207     }
 208   }
 209   return $dest;
 210 }
 211 function check_string($ics){
 212     $ics_file = explode("\n",$ics);
 213     foreach($ics_file as $line => $str){
 214         if(false === utf8ToUnicode($str)){
 215             $error[] = $line;
 216         }
 217     }
 218     if(isset($error) && is_array($error)){
 219         foreach($error as $line){
 220             dbg_error_log( "LOG check_string","error on lines %  invalid character in string %s" , ($line +1),$ics_file[$line]  );
 221             return false;
 222         }
 223     } else {
 224 //        dbg_error_log( "LOG check_string","the string is UTF8 compliant");
 225         return true;
 226     }
 227 }
 228 ?>