186 lines
5.5 KiB
PHP
186 lines
5.5 KiB
PHP
<?php
|
|
// File name: zh2py.inc.php
|
|
// Description: PHP subroutines to convert Chinese to pinyin
|
|
// Date: 2004-05-15
|
|
// Author: imacat <imacat@pristine.com.tw>
|
|
// Copyright: Copyright (C) 2004-2007 Pristine Communications
|
|
|
|
// We use GDBM rather than SQLite.  SQLite is cross-platform, but it is a lot
// slower than GDBM.  SQLite only beats GDBM in connection-open speed.
|
|
|
|
// Set the include path
|
|
if (!defined("INCPATH_SET")) {
|
|
require_once dirname(__FILE__) . "/incpath.inc.php";
|
|
}
|
|
// Referenced subroutines
|
|
require_once "monica/addslash.inc.php";
|
|
|
|
// Settings
// DBMDIR: directory that holds the DBM files.  A value defined by the
// including script is kept; otherwise it defaults to the machine-type
// directory (php_uname("m")) three levels above this file.
defined("DBMDIR")
    or define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
// _ZH2PY_DB: full path of the Chinese-to-pinyin database
define("_ZH2PY_DB", DBMDIR . "/zh2py.db");
// _ZH2PY_CHARPAT: regular expression fragment for the pinyin of one
// character: 1 to 6 units, each a lower-case ASCII letter or the byte
// pair C3 AA / C3 BC (UTF-8 "e-circumflex" / "u-umlaut"), followed by
// one digit from 1 to 5 (presumably the tone number).
define("_ZH2PY_CHARPAT", "(?:[a-z]|\\xC3[\\xAA\\xBC]){1,6}[1-5]");
// $_ZH2PY: shared database handle, opened on first use by _zh2py_split_text()
$_ZH2PY = null;
|
|
|
|
// zh2py: Convert Chinese to pinyin
//   $chinese: the text to convert
//   Returns the first pinyin produced by zh2pys(), or an empty string
//   when there is nothing to convert.
function zh2py($chinese)
{
    // zh2pys() answers an empty text with the string "" rather than
    // an array, so indexing its result would read an offset of an
    // empty string.  Bounce the empty text here instead.
    if ($chinese == "") {
        return "";
    }

    // Use first result from zh2pys()
    $pinyins = zh2pys($chinese);
    if (!is_array($pinyins) || !isset($pinyins[0])) {
        return "";
    }
    return $pinyins[0];
}
|
|
|
|
// zh2pys: Convert Chinese to pinyin, return all possible pinyins
//   $chinese: the text to convert
//   Returns an array with one converted string per combination
//   returned by _zh2py_chars2py().  Note that an empty text is
//   answered with the empty string "", not with an array.
function zh2pys($chinese)
{
    // Nothing to convert
    if ($chinese == "") {
        return "";
    }

    // Turn every piece of the text into a fragment of a printf format,
    // collecting the Chinese characters in reading order on the way
    $chars = array();
    $formats = array();
    foreach (_zh2py_split_text($chinese) as $piece) {
        if ($piece["is_chinese"]) {
            // One "%s" placeholder per Chinese character
            $length = mb_strlen($piece["text"]);
            for ($pos = 0; $pos < $length; $pos++) {
                $chars[] = mb_substr($piece["text"], $pos, 1);
            }
            $formats[] = implode(" ", array_pad(array(), $length, "%s"));
        } else {
            // Literal text: only the printf metacharacter needs escaping
            $formats[] = str_replace("%", "%%", $piece["text"]);
        }
    }

    // Glue the fragments together, separating two neighbours with a
    // space unless one of them already brings white space at the joint
    $format = "";
    $previous = null;
    foreach ($formats as $current) {
        if (   !is_null($previous)
            && !preg_match("/\s$/", $previous)
            && !preg_match("/^\s/", $current)) {
            $format .= " ";
        }
        $format .= $current;
        $previous = $current;
    }

    // Fill the format once for every possible combination of pinyins
    $pinyins = array();
    foreach (_zh2py_chars2py($chars) as $readings) {
        $pinyins[] = vsprintf($format, $readings);
    }

    return $pinyins;
}
|
|
|
|
// pinyin_match_chinese: If the pinyin matches the Chinese
//   $chinese: the original text
//   $pinyin:  the pinyin to check against that text
//   Returns true if $pinyin has one pinyin syllable (_ZH2PY_CHARPAT)
//   for every Chinese character and the non-Chinese text unchanged,
//   false otherwise.  Only the form of each syllable is checked; the
//   syllables are not looked up in the database.
function pinyin_match_chinese($chinese, $pinyin)
{
    // An empty text can only be matched by an empty pinyin.  Answer
    // here so that the splitter is never handed an empty text.
    if ($chinese == "") {
        return (string) $pinyin === "";
    }

    // Split text into Chinese or non-Chinese pieces
    $pieces = _zh2py_split_text($chinese);
    $count = count($pieces);

    // Convert each piece into a proper perl regular expression pattern
    for ($i = 0; $i < $count; $i++) {
        // A Chinese piece: one syllable pattern per character
        if ($pieces[$i]["is_chinese"]) {
            $length = mb_strlen($pieces[$i]["text"]);
            $patterns = array();
            for ($j = 0; $j < $length; $j++) {
                $patterns[] = _ZH2PY_CHARPAT;
            }
            $pieces[$i]["text"] = implode(" ", $patterns);
        // A non-Chinese piece
        } else {
            // Escape the perl regular expression metacharacter
            $pieces[$i]["text"] = addslashes_re_php($pieces[$i]["text"]);
        }
    }

    // Concatenate text pieces
    $pattern = $pieces[0]["text"];
    for ($i = 1; $i < $count; $i++) {
        // Insert a space, unless one of the two neighbours already has
        // white space (literal, or escaped as \r, \n or \t) at the joint
        if (   !preg_match("/(?:\s|\\\\[rnt])$/", $pieces[$i-1]["text"])
            && !preg_match("/^(?:\s|\\\\[rnt])/", $pieces[$i]["text"])) {
            $pattern .= " ";
        }
        $pattern .= $pieces[$i]["text"];
    }

    // The "D" modifier makes "$" match only at the very end of the
    // subject.  Without it "$" also matches before a trailing newline,
    // so a pinyin with an extra "\n" appended would be accepted.
    return preg_match("/^$pattern$/D", $pinyin)? true: false;
}
|
|
|
|
// _zh2py_split_text: Split text into Chinese or non-Chinese pieces
//   $text: the text to split
//   Returns a list of pieces in text order.  Each piece is an array
//   with "is_chinese" (whether its characters are keys of the pinyin
//   database) and "text" (the run of consecutive characters sharing
//   that status).  An empty text yields one empty non-Chinese piece.
function _zh2py_split_text($text)
{
    global $_ZH2PY;

    // The empty text has no first character to tag.  Answer it with
    // a single empty non-Chinese piece, so that callers can still
    // read $pieces[0], without touching the database.
    $empty = array(array("is_chinese" => false, "text" => ""));
    if ((string) $text === "") {
        return $empty;
    }

    // Start the database
    if (is_null($_ZH2PY)) {
        $_ZH2PY = dba_open(_ZH2PY_DB, "r", "gdbm");
    }

    // Walk through the characters, starting a new piece whenever
    // the Chinese status changes
    $pieces = array();
    $last = -1;
    $length = mb_strlen($text);
    for ($i = 0; $i < $length; $i++) {
        $char = mb_substr($text, $i, 1);
        // Look up each character only once
        $is_chinese = dba_exists($char, $_ZH2PY);
        if ($last < 0 || $is_chinese != $pieces[$last]["is_chinese"]) {
            // Start a new piece
            $pieces[] = array(
                "is_chinese" => $is_chinese,
                "text" => $char,
            );
            $last++;
        } else {
            // Append to the current piece
            $pieces[$last]["text"] .= $char;
        }
    }

    // Always return at least one piece
    if ($last < 0) {
        return $empty;
    }
    return $pieces;
}
|
|
|
|
// _zh2py_chars2py: Look up a series of Chinese characters
//                  and return all possible pinyins
//   $chars: list of characters to look up in the pinyin database
//   Returns a list of combinations.  Each combination is a list with
//   one pinyin per character, in the order of $chars.  The pinyins of
//   the first character vary slowest and those of the last character
//   vary fastest.  An empty $chars yields one empty combination.
function _zh2py_chars2py($chars)
{
    global $_ZH2PY;

    // Start from the single empty combination, and extend every
    // combination found so far with each pinyin of the next character
    $combinations = array(array());
    foreach ($chars as $char) {
        // The database stores the pinyins of a character joined by "|"
        $readings = explode("|", dba_fetch($char, $_ZH2PY));
        $extended = array();
        foreach ($combinations as $combination) {
            foreach ($readings as $reading) {
                $extended[] = array_merge($combination, array($reading));
            }
        }
        $combinations = $extended;
    }

    return $combinations;
}
|
|
|
|
// The SQLite version - we are not using it
|
|
// SQLite is not as efficient as GDBM in this case.
|
|
// See unused.inc.php
|
|
|
|
?>
|