UTF-8 encoding:
*
bytes bits representation
*
1 7 0bbbbbbb
*
2 11 110bbbbb 10bbbbbb
*
3 16 1110bbbb 10bbbbbb 10bbbbbb
*
4 21 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
*
* @author Ronen Botzer
* @param string $source string containing Unicode numeric entities
* @param boolean $w3cStyle - true: W3C-style entity that starts with '&' and ends with ';'; false: entity starts with '%'
* @param boolean $hasHexVal - true if W3C-style entity values are hexadecimal, false if decimal (the default); '%' (MS-style) entities are always read as hex
* @return string the UTF-8 encoded string
* @access public
*/
function utf8Encode ($source, $w3cStyle=true, $hasHexVal=false) {
    // Replaces numeric character entities found in $source with their UTF-8
    // byte sequences and returns the resulting string.
    //
    //   $w3cStyle = true,  $hasHexVal = false : entities look like "&#233;"
    //   $w3cStyle = true,  $hasHexVal = true  : entities look like "&#xE9;"
    //   $w3cStyle = false                     : entities look like "%E9"
    //                                           (always two hex digits;
    //                                           $hasHexVal is not consulted)
    //
    // Anything that is not a well-formed entity denoting a valid Unicode
    // scalar value is copied to the output unchanged.
    $source = (string) $source;

    if ($w3cStyle) {
        $delimiter = $hasHexVal ? '&#x' : '&#';
        $isHex = (bool) $hasHexVal;
    }
    else {
        $delimiter = '%';
        $isHex = true;
    }
    $digitSet = $isHex ? '0123456789abcdefABCDEF' : '0123456789';

    // explode() always yields at least one element; the first one is the
    // text that precedes the first delimiter, so it can never be an entity.
    $chunks = explode ($delimiter, $source);
    $utf8Str = array_shift ($chunks);

    // every remaining chunk was immediately preceded by a delimiter
    foreach ($chunks as $chunk) {
        // split the chunk into the entity's digits and the plain text after it
        if ($w3cStyle) {
            $end = strpos ($chunk, ';');
            if ($end === false) {
                // unterminated entity
                $digits = '';
                $rest = '';
            }
            else {
                // digits stop just before the ';', which is itself consumed
                $digits = (string) substr ($chunk, 0, $end);
                $rest = (string) substr ($chunk, $end + 1);
            }
            $wellFormed = ($digits !== '');
        }
        else {
            // %nn entities: exactly two hex digits, the remainder is plain text
            $digits = (string) substr ($chunk, 0, 2);
            $rest = (string) substr ($chunk, 2);
            $wellFormed = (strlen ($digits) === 2);
        }
        // every character must be a digit of the expected radix
        $wellFormed = $wellFormed
            && (strspn ($digits, $digitSet) === strlen ($digits));

        $unicode = -1;
        if ($wellFormed) {
            // may be a float (hexdec) or PHP_INT_MAX (intval) for absurdly
            // long inputs; the range check below rejects those
            $unicode = $isHex ? hexdec ($digits) : intval ($digits, 10);
            // only Unicode scalar values can be expressed in UTF-8:
            // nothing above U+10FFFF and no UTF-16 surrogates
            if ($unicode > 0x10FFFF
                || ($unicode >= 0xD800 && $unicode <= 0xDFFF)) {
                $wellFormed = false;
            }
        }

        if (!$wellFormed) {
            // not an entity after all: put back the delimiter that
            // explode() removed and keep the text as it was
            $utf8Str .= $delimiter . $chunk;
            continue;
        }

        $unicode = (int) $unicode;
        // pick the sequence length from the code point range and spread the
        // bits over the lead byte and the 10bbbbbb continuation bytes
        if ($unicode < 0x80) {
            // 7 bits: plain ASCII, 0bbbbbbb
            $utf8Substring = chr ($unicode);
        }
        else if ($unicode < 0x800) {
            // 11 bits: 110bbbbb 10bbbbbb
            $utf8Substring = chr (0xC0 | ($unicode >> 6))
                . chr (0x80 | ($unicode & 0x3F));
        }
        else if ($unicode < 0x10000) {
            // 16 bits: 1110bbbb 10bbbbbb 10bbbbbb
            $utf8Substring = chr (0xE0 | ($unicode >> 12))
                . chr (0x80 | (($unicode >> 6) & 0x3F))
                . chr (0x80 | ($unicode & 0x3F));
        }
        else {
            // 21 bits: 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
            $utf8Substring = chr (0xF0 | ($unicode >> 18))
                . chr (0x80 | (($unicode >> 12) & 0x3F))
                . chr (0x80 | (($unicode >> 6) & 0x3F))
                . chr (0x80 | ($unicode & 0x3F));
        }

        $utf8Str .= $utf8Substring . $rest;
    }
    return $utf8Str;
}