Source for file UTF8.php
Documentation is available at UTF8.php
* KSC5601 UTF8 internal API for pure code
* @subpackage KSC5601_pure
* @author JoungKyun.Kim <http://oops.org>
* @copyright (c) 2015, JoungKyun.Kim
* @link http://pear.oops.org/package/KSC5601
* Import the high-level API for converting character sets
require_once 'KSC5601/Stream.php';
* UCS2.php is needed only when PHP does not support iconv/mbstring
* import API class that controls UCS2
require_once 'KSC5601/UCS2.php';
* If UCS2.php is not needed, define a dummy class for compatibility
* UTF8 control class API
* Whether to print debug messages
// {{{ function rm_utf8bom ($s)
* Remove the UTF-8 BOM (the first 3 bytes)
* @param string Given strings
if ( ord ($s[0]) == 0xef && ord ($s[1]) == 0xbb && ord ($s[2]) == 0xbf )
// {{{ function is_utf8 ($s, $ascii)
* Check whether the given strings are UTF-8 or not
* @return boolean If the given strings are UTF-8, return true
* @param string Given strings
* @param boolean Whether to check if the given strings are ASCII only
function is_utf8 ($s, $ascii = false) {
if ( ord ($s[0]) == 0xef && ord ($s[1]) == 0xbb && ord ($s[2]) == 0xbf )
for ( $i= 0; $i< $l; $i++ ) {
# single-byte characters are skipped
if ( ! (ord ($s[$i]) & 0x80) )
# the first byte of a multibyte UTF-8 sequence must start with bits 11
if ( substr ($first, 0, 2) == '10' )
* The 2-byte UTF-8 check is skipped, because some Hangul byte sequences overlap with 2-byte UTF-8
* 2byte: 1100000x (10xxxxxx)
* 3byte: 11100000 10xxxxxx (10xxxxxx)
* 4byte: 11110000 10xxxxxx (10xxxxxx 10xxxxxx)
* 5byte: 11111000 10xxxxxx (10xxxxxx 10xxxxxx 10xxxxxx)
* 6byte: 11111100 10xxx0xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
for ( $j= 1; $j< $byte; $j++ ) {
if ( KSC5601_Stream::chr2bin ($s[$i+1], ">>$n") != KSC5601_Stream::check2byte ($byte) )
if ( $ascii && $ascii_status )
// {{{ function utf8enc ($s)
* @return string UTF-8 strings
* @param string Given UHC strings
for ( $i= 0; $i< $len; $i++ ) {
if ( ord ($s[$i]) & 0x80 ) {
$uni[0] = $this->decbin ($ucs2 >> 12);
$uni[1] = $this->decbin ($ucs2 >> 8 & 0x0f);
$uni[2] = $this->decbin ($ucs2 >> 4 & 0x00f);
$uni[3] = $this->decbin ($ucs2 & 0x000f);
$uc1 = bindec ('1110' . $uni[0]);
// {{{ function utf8dec ($s)
* @return string UHC strings
* @param string Given UTF-8 strings
for ( $i= 0; $i< $l; $i++ ) {
if ( ord ($s[$i]) & 0x80 ) {
$uni2 = ord ($s[$i + 1]);
$uni3 = ord ($s[$i + 2]);
$ucs2 = dechex ($uni1 & 0x0f) .
dechex ((($uni2 & 0x03) << 2) | (($uni3 & 0x30) >> 4)) .
#1111(1111).11(1111)(11).11(11)(1111)
echo 'HEX STR => ' . $ucs2 . "\n";
echo '0 => ' . $ucs2[0] . ' ' . decbin (hexdec ($ucs2[0])) . "\n";
echo '1 => ' . $ucs2[1] . ' ' . decbin (hexdec ($ucs2[1])) . "\n";
echo '2 => ' . $ucs2[2] . ' ' . decbin (hexdec ($ucs2[2])) . "\n";
echo '3 => ' . $ucs2[3] . ' ' . decbin (hexdec ($ucs2[3])) . "\n";
* vim600: noet sw=4 ts=4 fdm=marker
* vim<600: noet sw=4 ts=4
|