PostgreSQL的漢字轉拼音 · postgresql手冊

線上產品升級需要導入一批3W多的用戶數據，問題卡在了用戶的漢字拼音碼上，Excel也能實現大部分漢字的拼音轉換，但還有很多生僻字無法轉換。下面的漢字轉拼音功能是在德哥的BLOG上的主題函數上做了一些簡化和外加做了一層嵌套實現我本地的功能。需求：軟件需要根據用戶的拼音碼(中文的拼音縮寫)輸入來顯示用戶姓名，比如輸入ZS就能出來一串類似張三、張松之類的用戶名。

DB版本9.3

**數據準備:**

~~~
-- Lookup table: one Chinese character (hz), its full pinyin (py) and the
-- first letter of that pinyin (zm).
create table pinyin (hz varchar(1), py varchar(6), zm varchar(1));

-- Index and optional unique constraints; enable whichever fit your data.
create index idx_pinyin_hz on pinyin(hz);
--create unique index idx_pinyin_hz_py on pinyin(hz,py);
--create unique index idx_pinyin_hz_zm on pinyin(hz,zm);

-- Characters that are not yet present in pinyin are recorded in this table.
create table new_discover (hz varchar(1) primary key, py varchar(6), zm varchar(1));
~~~

**函數準備:**

~~~
-- Output type for get_py_zm; both columns are arrays.
CREATE TYPE t_py_zm as (c1 text[], c2 text[]);

-- get_py_zm(i_hz): convert a string to pinyin.
--
--   i_hz     the text to convert, processed one character at a time.
--   returns  exactly one row: c1 holds the concatenated full pinyin and c2
--            the concatenated first letters. When a character has several
--            rows in pinyin, every combination is produced, so the arrays
--            can hold more than one element.
--
-- Characters whose ascii() value is <= 255 are copied through unchanged.
-- Other characters are looked up in pinyin; if no row is found the character
-- is recorded in new_discover and '?' is used for both py and zm.
--
-- The function-level SET only silences the "already exists, skipping" NOTICE
-- of CREATE TEMPORARY TABLE IF NOT EXISTS while the function runs and is
-- restored on exit, so the caller's session setting is left untouched.
CREATE OR REPLACE FUNCTION get_py_zm(i_hz text)
  RETURNS SETOF t_py_zm AS
$BODY$
DECLARE
  v_hz      text;        -- current character of i_hz
  i         int := 0;    -- zero-based position of v_hz within i_hz
  v_from    text := '';  -- FROM list of the dynamic query
  v_py_expr text := '';  -- expression concatenating the py of every position
  v_zm_expr text := '';  -- expression concatenating the zm of every position
  v_sql     text;
  v_max_id  int;
BEGIN
  -- Scratch table with one or more (py, zm) rows per character position.
  CREATE TEMPORARY TABLE IF NOT EXISTS tmp_get_py_zm
    (id int, py varchar(6), zm varchar(1)) ON COMMIT DELETE ROWS;
  truncate table tmp_get_py_zm;

  -- Split the input into single characters and store the reading(s) of each.
  for v_hz in select regexp_split_to_table(i_hz, '') loop
    if ascii(v_hz) > 255 then
      insert into tmp_get_py_zm (id, py, zm)
        select i, p.py, p.zm from pinyin as p where p.hz = v_hz;
    else
      insert into tmp_get_py_zm (id, py, zm) values (i, v_hz, v_hz);
    end if;

    -- FOUND is false only when the pinyin lookup above inserted no rows.
    if not found then
      insert into new_discover (hz)
        select v_hz
        where not exists (select 1 from new_discover as nd where nd.hz = v_hz);
      insert into tmp_get_py_zm (id, py, zm) values (i, '?', '?');
    end if;

    i := i + 1;
  end loop;

  select max(id) into v_max_id from tmp_get_py_zm;

  if v_max_id > 0 then
    -- Two or more characters: cross join one sub-select per position so that
    -- every combination of readings is concatenated in input order.
    for v_id in 0 .. v_max_id loop
      if v_id > 0 then
        v_from    := v_from    || ' cross join ';
        v_py_expr := v_py_expr || '||';
        v_zm_expr := v_zm_expr || '||';
      end if;
      v_from    := v_from    || format('(select py,zm from tmp_get_py_zm where id=%s) as t%s', v_id, v_id);
      v_py_expr := v_py_expr || format('t%s.py::text', v_id);
      v_zm_expr := v_zm_expr || format('t%s.zm::text', v_id);
    end loop;
    v_sql := 'select array_agg(' || v_py_expr || '),array_agg(' || v_zm_expr || ') from ' || v_from;
  else
    -- Zero or one character: aggregate the scratch table directly.
    v_sql := 'select array_agg(py::text),array_agg(zm::text) from tmp_get_py_zm';
  end if;

  return query execute v_sql;
  return;
END;
$BODY$
  LANGUAGE plpgsql VOLATILE
  COST 100
  ROWS 1  -- the aggregate query above always yields exactly one row
  SET client_min_messages = warning;

ALTER FUNCTION get_py_zm(text) OWNER TO postgres;

-- The function does more than I needed: besides the pinyin initials it also
-- returns the full pinyin. Example:
postgres=# select * from get_py_zm('我愛你');
    c1    |  c2
----------+-------
 {woaini} | {wan}
(1 row)
~~~

我的需求只需要輸出簡寫，故外面再套一層循環更新函數并轉換數組為字符類型，省得去更新原來的函數

~~~
-- f_update_pinyin(): fill tmp_kenyon.pym with the pinyin initials of
-- tmp_kenyon.name, using the first element of get_py_zm(name).c2.
--
-- Only rows whose pym does not contain two consecutive upper-case letters are
-- processed; drop the WHERE clause to refresh every row. tmp_kenyon is the
-- table I needed to update.
-- NOTE(review): rows whose pym is NULL are skipped as well, because
-- NULL !~ '...' is not true — confirm that this is intended.
CREATE OR REPLACE FUNCTION f_update_pinyin()
  RETURNS VOID AS
$BODY$
declare
  v_value text;
  rec     record;
begin
  for rec in
    select num, name
    from tmp_kenyon
    where pym !~ '[A-Z][A-Z]'
  loop
    select c2[1]::text into v_value from get_py_zm(rec.name);
    update tmp_kenyon set pym = v_value where num = rec.num;
  end loop;
  return;
end;
$BODY$
  LANGUAGE plpgsql VOLATILE
  COST 100;

ALTER FUNCTION f_update_pinyin() OWNER TO postgres;

-- To run it, simply call: select f_update_pinyin();
-- Characters missing from the dictionary end up in new_discover; fill in
-- their py/zm there and finally add them to the pinyin table.
~~~

pinyin詞庫表整理在了下面的云盤地址里，目前收錄了將近7000個常用漢字，地址在： http://pan.baidu.com/s/1pJ6spSn

導入方式

~~~
[postgres@db1 ~]$ psql
psql (9.2.4)
Type "help" for help.

postgres=# \i /home/postgres/py.sql
~~~

**參考：** http://blog.163.com/digoal@126/blog/static/163877040201241452827379/