整理了汉字的拼音数据,我们取其中的pinyin.txt再做些处理,让其更加易用。
预处理
pinyin.txt看起来是这样的(部分内容):
U+3007: líng # 〇
U+3400: qiū # 㐀
U+3401: tiàn # 㐁
U+3404: kuà # 㐄
U+3405: wǔ # 㐅
U+3406: yǐn # 㐆
U+340C: yí # 㐌
U+3416: xié # 㐖
用下面的python脚本处理一下:
# coding: utf-8
"""Convert pinyin.txt into pinyin.db and pinyin.json.

Each data line of pinyin.txt looks like ``U+716C: yáng,yàng # 煬``:
a code point, a colon, a comma-separated list of pinyin readings, a
``#`` and finally the character itself.

Outputs:
    pinyin.db   -- one ``<hanzi>=<py1>,<py2>`` record per line
    pinyin.json -- ``{"<hanzi>": ["<py1>", "<py2>"]}``

The script is written to run unchanged on both Python 2 and Python 3;
all file I/O goes through ``codecs.open`` so text is always UTF-8.
"""
import json
import codecs


def parse_line(line):
    """Parse one line of pinyin.txt.

    Returns a ``(hanzi, [pinyin, ...])`` tuple, or ``None`` when the line
    is blank or is a comment-only line (starts with ``#``).

    Raises ``Exception`` when a data line lacks the ``#`` or ``:``
    separator.
    """
    line = line.strip()
    # Blank and comment-only lines carry no record.
    if not line or line.startswith('#'):
        return None
    if '#' not in line:
        raise Exception('no # in line: ' + line)
    fields = line.split('#')
    hanzi = fields[1].strip()
    code_and_pys = fields[0].split(':')
    if len(code_and_pys) < 2:
        raise Exception('no : in line: ' + line)
    # A single reading simply yields a one-element list.
    pys = [item.strip() for item in code_and_pys[1].split(',')]
    return hanzi, pys


def load_table(path):
    """Read *path* (UTF-8) and return a ``{hanzi: [pinyin, ...]}`` dict."""
    table = {}
    # The context manager closes the input file even if parsing fails.
    with codecs.open(path, 'r', 'utf-8') as src:
        for line in src:
            entry = parse_line(line)
            if entry is not None:
                hanzi, pys = entry
                table[hanzi] = pys
    return table


def db_lines(table):
    """Return the pinyin.db records (``hanzi=py1,py2\\n``) for *table*."""
    return [hanzi + '=' + ','.join(table[hanzi]) + '\n' for hanzi in table]


def main():
    """Generate pinyin.db and pinyin.json from ./pinyin.txt."""
    table = load_table('./pinyin.txt')

    print('gen pinyin.db')
    with codecs.open('pinyin.db', 'w', 'utf-8') as out:
        # writelines avoids building one huge string by repeated "+".
        out.writelines(db_lines(table))

    print('gen pinyin.json')
    with codecs.open('pinyin.json', 'w', 'utf-8') as out:
        json.dump(table, out, ensure_ascii=False, indent=4)

    print('finish')


if __name__ == '__main__':
    main()
得到的pinyin.db的部分内容如下:
?=qiáng?=náng?=zhǎn?=yǒng?=tà?=xiè,wén?=ǒu?=xiàng?=guó
pinyin.json文件是json格式数据:
{ "?": [ "gǔ" ], "?": [ "gǒng" ], "?": [ "lǐ" ], "?": [ "gǔ" ], // ..... "?": [ "yì" ], "煭": [ "liè" ], "煬": [ "yáng", "yàng" ]}
json数据可以直接拿到nodejs等程序中使用。
在上面给出的转换脚本中生成json数据的一个代码片段如下:
# Write the hanzi -> pinyin-list mapping `table` to pinyin.json.
# indent=4 pretty-prints the output; ensure_ascii=False keeps non-ASCII
# characters as-is instead of emitting \uXXXX escapes.
with open('pinyin.json', 'w') as out: json.dump(table, out, ensure_ascii=False, indent=4)
indent参数是为了让json在文件中以较为美观的形式保存起来。ensure_ascii参数设置为False是为了让unicode字符不保存成下面这样:
{ "\ud856\udf9d": [ "g\u01d4" ], "\ud856\uddc2": [ "g\u01d2ng" ], "\ud869\udd74": [ "l\u01d0" ], "\ud869\udd77": [ "g\u01d4" ], // ......
java工具类
eclipse下:
android studio下:
SimplePinyin.java源码:
package me.letiantian.simplepinyin;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Looks up the pinyin of a Chinese character from the bundled
 * {@code pinyin.db} resource.
 *
 * <p>Each line of {@code pinyin.db} has the form {@code <hanzi>=<py1>,<py2>};
 * the value stored in the table is the raw comma-separated pinyin string.
 */
public class SimplePinyin {

    /** Utility class; not meant to be instantiated. */
    private SimplePinyin() {}

    /** Character -> comma-separated pinyin, loaded once when the class is initialised. */
    public static final Map<String, String> TABLE = SimplePinyin.getPinyinResource();

    /**
     * Reads {@code pinyin.db} from the classpath and builds a fresh lookup table.
     *
     * <p>Loading is best-effort: if the resource is missing or an I/O error
     * occurs, the problem is reported on stderr and the entries read so far
     * (possibly none) are returned. Lines that do not contain both a key and
     * a value are skipped.
     *
     * @return a new mutable map from character to its comma-separated pinyin
     */
    public static Map<String, String> getPinyinResource() {
        Map<String, String> map = new HashMap<String, String>();
        InputStream is = SimplePinyin.class.getClassLoader().getResourceAsStream("pinyin.db");
        if (is == null) {
            // getResourceAsStream returns null when the resource is absent;
            // report it instead of failing with a NullPointerException.
            System.err.println("SimplePinyin: resource pinyin.db not found on the classpath");
            return map;
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                String[] tokens = line.trim().split("=");
                // Skip blank or malformed lines rather than indexing past the array.
                if (tokens.length < 2 || tokens[0].length() == 0) {
                    continue;
                }
                map.put(tokens[0], tokens[1]);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the stream, even when reading failed half-way.
            try {
                if (br != null) {
                    br.close();
                } else {
                    is.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return map;
    }

    /**
     * Returns the pinyin of a single character.
     *
     * @param c the character to look up
     * @return the comma-separated pinyin, or {@code null} if {@code c} is not in the table
     */
    public static String getPinyin(char c) {
        String sChar = String.valueOf(c);
        return TABLE.get(sChar);
    }

    /**
     * Returns the pinyin stored under the key {@code s}.
     *
     * <p>The string is used as the lookup key as a whole; it is not split
     * into characters.
     *
     * @param s the key to look up
     * @return the comma-separated pinyin, or {@code null} if {@code s} is not a key of the table
     */
    public static String getPinyin(String s) {
        return TABLE.get(s);
    }
}
示例(test.java):
package me.letiantian.simplepinyin;

/**
 * Small command-line demo of {@link SimplePinyin}: prints the result of
 * three lookups, one per line.
 */
public class test {

    public static void main(String[] args) {
        // Lookup through the char overload.
        String byChar = SimplePinyin.getPinyin('乐');
        System.out.println(byChar);

        // Lookup through the String overload with a one-character key.
        String byString = SimplePinyin.getPinyin("了");
        System.out.println(byString);

        // A multi-character string is used as the key as a whole.
        String byLongString = SimplePinyin.getPinyin("了ad");
        System.out.println(byLongString);
    }
}
运行结果:
lè,yuè
le,liǎo,liào
null