使用maven引入相關(guān)的jar
1
2
3
4
5
|
< dependency > < groupId >com.belerweb</ groupId > < artifactId >pinyin4j</ artifactId > < version >2.5.1</ version > </ dependency > |
創(chuàng)建Pinyin4jUtil
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
|
package com.os.core.util.solr; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; import java.util.ArrayList; import java.util.Hashtable; import java.util.List; import java.util.Map; /** * 漢語拼音工具類 * Created by PengSongHe on 2017/2/9 0009. */ public class Pinyin4jUtil { public static void main(String[] args) { String str = "測試"; String pinyin = Pinyin4jUtil.converterToSpell(str); System.out.println(str + " pin yin :" + pinyin); pinyin = Pinyin4jUtil.converterToFirstSpell(str); System.out.println(str + " short pin yin :" + pinyin); } /** * 漢字轉(zhuǎn)換位漢語拼音首字母,英文字符不變,特殊字符丟失 支持多音字,生成方式如(長沙市長:cssc,zssz,zssc,cssz) * * @param chines 漢字 * @return 拼音 */ public static String converterToFirstSpell(String chines) { StringBuffer pinyinName = new StringBuffer(); char[] nameChar = chines.toCharArray(); HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); for (int i = 0; i < nameChar.length ; i++) { if (nameChar[i] > 128) { try { // 取得當(dāng)前漢字的所有全拼 String[] strs = PinyinHelper.toHanyuPinyinStringArray( nameChar[i], defaultFormat); if (strs != null) { for (int j = 0; j < strs.length ; j++) { // 取首字母 pinyinName.append(strs[j].charAt(0)); if (j != strs.length - 1) { pinyinName.append(","); } } } // else { // pinyinName.append(nameChar[i]); // } } catch (BadHanyuPinyinOutputFormatCombination e) { e.printStackTrace(); } } else { pinyinName.append(nameChar[i]); } pinyinName.append(" "); } // return pinyinName.toString(); return parseTheChineseByObject(discountTheChinese(pinyinName.toString())); } /** * 漢字轉(zhuǎn)換位漢語全拼,英文字符不變,特殊字符丟失 * 支持多音字,生成方式如(重當(dāng)參:zhongdangcen,zhongdangcan,chongdangcen * ,chongdangshen,zhongdangshen,chongdangcan) * * @param chines 漢字 * @return 拼音 */ public static String converterToSpell(String chines) { StringBuffer pinyinName = new StringBuffer(); char[] nameChar = chines .toCharArray(); HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); for (int i = 0 ; i < nameChar.length; i++) { if (nameChar[i] > 128) { try { // 取得當(dāng)前漢字的所有全拼 String[] strs = PinyinHelper.toHanyuPinyinStringArray( nameChar[i], defaultFormat); if (strs != null) { for (int j = 0; j < strs.length ; j++) { pinyinName.append(strs[j]); if (j != strs.length - 1) { pinyinName.append(","); } } } } catch (BadHanyuPinyinOutputFormatCombination e) { e.printStackTrace(); } } else { pinyinName.append(nameChar[i]); } pinyinName.append(" "); } // return pinyinName.toString(); return parseTheChineseByObject(discountTheChinese(pinyinName.toString())); } /** * 去除多音字重復(fù)數(shù)據(jù) * * @param theStr * @return */ private static List<Map<String, Integer>> discountTheChinese(String theStr) { // 去除重復(fù)拼音后的拼音列表 List< Map <String, Integer>> mapList = new ArrayList< Map <String, Integer>>(); // 用于處理每個字的多音字,去掉重復(fù) Map< String , Integer> onlyOne = null; String[] firsts = theStr.split(" "); // 讀出每個漢字的拼音 for (String str : firsts) { onlyOne = new Hashtable< String , Integer>(); String[] china = str.split(","); // 多音字處理 for (String s : china) { Integer count = onlyOne.get(s); if (count == null) { onlyOne.put(s, new Integer(1)); } else { onlyOne.remove(s); count++; onlyOne.put(s, count); } } mapList.add(onlyOne); } return mapList; } /** * 解析并組合拼音,對象合并方案(推薦使用) * * @return */ private static String parseTheChineseByObject( List< Map <String, Integer>> list) { Map< String , Integer> first = null; // 用于統(tǒng)計每一次,集合組合數(shù)據(jù) // 遍歷每一組集合 for (int i = 0; i < list.size (); i++) { // 每一組集合與上一次組合的Map Map<String, Integer> temp = new Hashtable< String , Integer>(); // 第一次循環(huán),first為空 if (first != null) { // 取出上次組合與此次集合的字符,并保存 for (String s : first.keySet()) { for (String s1 : list.get(i).keySet()) { String str = s + s1; temp.put(str, 1); } } // 清理上一次組合數(shù)據(jù) if (temp != null && temp.size() > 0) { first.clear(); } } else { for (String s : list.get(i).keySet()) { String str = s; temp.put(str, 1); } } // 保存組合數(shù)據(jù)以便下次循環(huán)使用 if (temp != null && temp.size() > 0) { first = temp; } } String returnStr = ""; if (first != null) { // 遍歷取出組合字符串 for (String str : first.keySet()) { returnStr += (str + ","); } } if (returnStr.length() > 0) { returnStr = returnStr.substring(0, returnStr.length() - 1); } return returnStr; } } |
以上這篇使用Pinyin4j進(jìn)行拼音分詞的方法就是小編分享給大家的全部內(nèi)容了,希望能給大家一個參考,也希望大家多多支持服務(wù)器之家。
原文鏈接:http://blog.csdn.net/psh18513234633/article/details/78839078