




echo Pinyin::trans('带着希望去旅行,比到达终点更美好'), "\n";

// output: "dài zhe xī wàng qù lǔ xíng bǐ dào dá zhōng diǎn gèng měi hǎo"


/**
 * Chinese-to-pinyin translator.
 *
 * Pinyin::trans() is the static entry point. It works through a lazily created
 * shared instance (getInstance()) whose dictionary is read from
 * cedict/cedict_ts.u8 next to this file and cached under cache/.
 *
 * NOTE(review): the opening brace after this declaration and the matching
 * closing brace at the end of the class are missing; they have to be restored
 * before the file can be parsed.
 */
class Pinyin



/**
 * Dictionary path.
 *
 * @var string
 */


protected $dictionary;


/**
 * Shared settings.
 *
 * - delimiter: separator string; presumably what addDelimiter() puts between
 *              syllables (that method is not visible here — confirm).
 * - accent:    true makes trans() add tone marks, false makes it strip the
 *              tone information instead.
 *
 * @var array
 */
protected static $setting = array(
    'delimiter' => ' ',
    'accent'    => true,
);

/**
 * Shared instance.
 *
 * @var Pinyin
 */


protected static $instance;


/**
 * Constructor.
 *
 * Sets the PHP memory limit to 160M and points the translator at the bundled
 * dictionary file.
 */
public function __construct()
{
    // Process-wide side effect, applied on every construction; presumably
    // needed because the parsed dictionary is large.
    ini_set('memory_limit', '160M');

    $this->dictionary = __DIR__ . '/cedict/cedict_ts.u8';
}



/**
 * Merge the given options into the shared settings.
 *
 * The settings are static, so a change made here stays in effect for every
 * later call until it is overridden again.
 *
 * @param array $setting options to override, keyed by option name.
 *
 * @return void
 */
public static function set(array $setting = array())
{
    self::$setting = array_merge(self::$setting, $setting);
}



/**
 * Get the shared Pinyin instance, creating it on first use.
 *
 * @return Pinyin
 */
public static function getInstance()
{
    if (null === self::$instance) {
        self::$instance = new self;
    }

    return self::$instance;
}



/**
 * Convert the Chinese in a string to pinyin.
 *
 * @param string $string  source string.
 * @param array  $setting settings; when non-empty they are merged into the
 *                        static settings through set() and therefore remain
 *                        in effect for later calls as well.
 *
 * @return string
 */
public static function trans($string, array $setting = array())
{
    $instance = self::getInstance();

    // merge setting
    if (!empty($setting)) {
        self::set($setting);
    }

    $dictionary = $instance->loadDictionary();

    // do replace: each dictionary word found in the text is swapped for its
    // pinyin followed by a space
    foreach ($dictionary as $line) {
        $string = str_replace($line['simplified'], "{$line['pinyin_marks']} ", $string);

        // Stop as soon as nothing is left to translate.
        // NOTE(review): the body of this check was lost from the file; `break`
        // is the only reading under which the test has any effect — confirm
        // against version control.
        if (!$instance->containsChinese($string)) {
            break;
        }
    }

    // add accents, or drop the tone information when accents are disabled
    if (self::$setting['accent']) {
        $string = $instance->pinyin_addaccents(strtolower($string));
    } else {
        $string = $instance->removeTone(strtolower($string));
    }

    // clean the string
    $string = $instance->removeUnwantedCharacters($string);

    // add delimiter
    $string = $instance->addDelimiter($string);

    return $instance->escape($string);
}



/**
 * Load the dictionary, from the cache file when one exists, otherwise by
 * parsing the dictionary file and writing the cache.
 *
 * @return array
 */
protected function loadDictionary()
{
    $cacheFilename = $this->getCacheFilename($this->dictionary);

    // load from cache
    if (file_exists($cacheFilename)) {
        return $this->loadFromCache($cacheFilename);
    }

    // parse and cache
    $parsedDictionary = $this->parseDictionary($this->dictionary);

    // Never persist an empty result: the cache file name depends only on the
    // dictionary path, so an empty cache written after one failed read would
    // be served on every later call.
    if (!empty($parsedDictionary)) {
        $this->cache($cacheFilename, $parsedDictionary);
    }

    return $parsedDictionary;
}



/**
 * Get the filename of the cache file for a dictionary, creating the cache
 * directory if it does not exist yet.
 *
 * @param string $dictionary dictionary path.
 *
 * @return string path inside the cache/ directory next to this file, named
 *                after the md5 of the dictionary path.
 */
protected function getCacheFilename($dictionary)
{
    $cacheDir = __DIR__ . '/cache/';

    if (!is_dir($cacheDir)) {
        mkdir($cacheDir, 0755, true);
    }

    return $cacheDir . md5($dictionary);
}



/**
 * Parse the dictionary file into a PHP array.
 *
 * Each returned element has two keys: 'simplified' (the word) and
 * 'pinyin_marks' (the text found between the square brackets of the entry).
 * Elements are ordered from the longest word to the shortest.
 *
 * @param string $dictionary path of dictionary file.
 *
 * @return array
 */
protected function parseDictionary($dictionary)
{
    $lines = file($dictionary);

    // Capture groups: 1 traditional, 2 simplified, 3 pinyin, 4 translation.
    $regex = '#(.*?) (.*?) \[(.*?)\] \/(.*)\/#';

    $content = array();

    foreach ($lines as $entry) {
        // lines starting with "#" are comments
        if (0 === stripos($entry, '#')) {
            continue;
        }

        // skip anything that is not a well-formed entry instead of storing
        // an element built from undefined match groups
        if (!preg_match($regex, $entry, $matches)) {
            continue;
        }

        $content[] = array(
            'simplified'   => $matches[2],
            'pinyin_marks' => $matches[3],
        );
    }

    // sort by simplified string length, longest first
    usort($content, function ($a, $b) {
        return mb_strlen($b['simplified']) - mb_strlen($a['simplified']);
    });

    return $content;
}



/**
 * Load the dictionary from a cached file.
 *
 * The file is included as PHP and whatever it returns is handed back.
 *
 * @param string $dictionary cached file name
 *
 * @return array
 */
protected function loadFromCache($dictionary)
{
    return include $dictionary;
}



/**
 * Write array to file.
 *
 * @param string $filename filename.
 * @param array  $array    parsed dictionary.
 *
 * @return void
 */


// NOTE(review): this method is truncated. The string literal opened in the
// file_put_contents() call below is never closed, and the lines after it (a
// pattern => replacement table passed to preg_replace) read like the tail of a
// different method whose header is missing. Restore the lost text from version
// control rather than reconstructing it by hand.
protected function cache($filename, $array)


file_put_contents($filename, “ ‘u’,

‘/\d/’ => ”,


return preg_replace(array_keys($replacement), $replacement, $string);



/**
 * Credits for these 2 functions go to Bouke Versteegh, who shared these
 * at http://stackoverflow.com/questions/1598856/convert-numbered-to-accentuated-pinyin
 *
 * @param string $string The pinyin string with tone numbers, i.e. "ni3 hao3"
 *
 * @return string The formatted string with tone marks, i.e. "nǐ hǎo"
 */


// NOTE(review): this statement is incomplete. The first argument of
// preg_replace_callback() (the pattern) and the end of the call (the subject
// argument and the closing parentheses) are missing. The callback unpacks its
// match into a syllable and a tone number, so the pattern presumably has two
// capture groups — restore it from version control.
protected function pinyin_addaccents($string)


# Find words with a number behind them, and replace with callback fn.

return str_replace(‘u:’, ‘ü’, preg_replace_callback(


array($this, ‘pinyin_addaccents_cb’),



/**
 * Helper callback for pinyin_addaccents(): turns one numbered syllable into
 * its tone-marked form.
 *
 * @param array $match regex match; index 1 holds the syllable letters and
 *                     index 2 the tone number.
 *
 * @return string the syllable with the tone mark placed on the right vowel.
 */
protected function pinyin_addaccents_cb($match)
{
    static $accentmap = null;

    if ($accentmap === null) {
        # Where to place the accent marks
        $stars =
            'a* e* i* o* u* ü* ' .
            'A* E* I* O* U* Ü* ' .
            'a*i a*o e*i ia* ia*o ie* io* iu* ' .
            'A*I A*O E*I IA* IA*O IE* IO* IU* ' .
            'o*u ua* ua*i ue* ui* uo* üe* ' .
            'O*U UA* UA*I UE* UI* UO* ÜE*';

        # The lookup keys are the very same clusters without the marker, so
        # derive them instead of maintaining a second hand-written list.
        $nostars = str_replace('*', '', $stars);

        # Build an array like array('a' => 'a*') and store statically
        $accentmap = array_combine(explode(' ', $nostars), explode(' ', $stars));
    }

    static $vowels = array('a*', 'e*', 'i*', 'o*', 'u*', 'ü*', 'A*', 'E*', 'I*', 'O*', 'U*', 'Ü*');

    static $pinyin = array(
        1 => array('ā', 'ē', 'ī', 'ō', 'ū', 'ǖ', 'Ā', 'Ē', 'Ī', 'Ō', 'Ū', 'Ǖ'),
        2 => array('á', 'é', 'í', 'ó', 'ú', 'ǘ', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ǘ'),
        3 => array('ǎ', 'ě', 'ǐ', 'ǒ', 'ǔ', 'ǚ', 'Ǎ', 'Ě', 'Ǐ', 'Ǒ', 'Ǔ', 'Ǚ'),
        4 => array('à', 'è', 'ì', 'ò', 'ù', 'ǜ', 'À', 'È', 'Ì', 'Ò', 'Ù', 'Ǜ'),
        5 => array('a', 'e', 'i', 'o', 'u', 'ü', 'A', 'E', 'I', 'O', 'U', 'Ü'),
    );

    list(, $word, $tone) = $match;

    # The accent map only knows "ü". If the syllable arrives in the "u:"
    # spelling, convert it first; otherwise the mark lands on a plain "u"
    # ("lu:" + 3 would give "lǔ:" instead of "lǚ").
    $word = str_replace(array('u:', 'U:'), array('ü', 'Ü'), $word);

    # Add star to vowelcluster
    $word = strtr($word, $accentmap);

    # A digit outside 1-5 has no row in the table; treat it like the neutral
    # tone so the vowel is kept instead of being replaced by nothing.
    $marks = isset($pinyin[$tone]) ? $pinyin[$tone] : $pinyin[5];

    # Replace starred letter with accented
    $word = str_replace($vowels, $marks, $word);

    return $word;
}



