Skip to content

Commit b9a899b

Browse files
committed
支持𩽾𩾌(ān kāng)之类的补充字符集 fix hankcs#1564
1 parent 2577426 commit b9a899b

File tree

3 files changed

+24
-4
lines changed

3 files changed

+24
-4
lines changed

data/dictionary/pinyin/pinyin.txt

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30724,4 +30724,5 @@
3072430724
龥=yue4
3072530725
重启=chong2,qi3
3072630726
还款=huan2,kuan3
30727-
侠传=xia2,zhuan4
30727+
侠传=xia2,zhuan4
30728+
𩽾𩾌=an1,kang1

src/main/java/com/hankcs/hanlp/dictionary/py/PinyinDictionary.java

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -180,15 +180,17 @@ protected static List<Pinyin> segLongest(char[] charArray, AhoCorasickDoubleArra
180180
protected static List<Pinyin> segLongest(char[] charArray, AhoCorasickDoubleArrayTrie<Pinyin[]> trie, boolean remainNone)
181181
{
182182
final Pinyin[][] wordNet = new Pinyin[charArray.length][];
183+
final int[] lengths = new int[charArray.length];
183184
trie.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<Pinyin[]>()
184185
{
185186
@Override
186187
public void hit(int begin, int end, Pinyin[] value)
187188
{
188189
int length = end - begin;
189-
if (wordNet[begin] == null || length > wordNet[begin].length)
190+
if (length > lengths[begin])
190191
{
191-
wordNet[begin] = length == 1 ? new Pinyin[]{value[0]} : value;
192+
wordNet[begin] = value;
193+
lengths[begin] = length;
192194
}
193195
}
194196
});
@@ -208,7 +210,7 @@ public void hit(int begin, int end, Pinyin[] value)
208210
{
209211
pinyinList.add(pinyin);
210212
}
211-
offset += wordNet[offset].length;
213+
offset += lengths[offset];
212214
}
213215
return pinyinList;
214216
}

src/test/java/com/hankcs/hanlp/dictionary/py/PinyinDictionaryTest.java

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,17 @@
1+
package com.hankcs.hanlp.dictionary.py;
2+
3+
import com.hankcs.hanlp.HanLP;
4+
import junit.framework.TestCase;
5+
6+
import java.util.Arrays;
7+
8+
/**
 * Smoke test for pinyin lookup and conversion, including a word written with
 * supplementary-plane characters (code points above U+FFFF, which Java stores
 * as two {@code char}s each).
 *
 * NOTE(review): the test only prints results and asserts nothing, so it can
 * fail only if one of the calls throws.
 */
public class PinyinDictionaryTest extends TestCase
9+
{
10+
11+
/**
 * Prints the result of one dictionary lookup and two pinyin conversions.
 */
public void testGet()
12+
{
13+
// Direct lookup of a single-character key; Arrays.toString also tolerates a null result.
System.out.println(Arrays.toString(PinyinDictionary.get("鼖")));
14+
// "\uD867\uDF7E\uD867\uDF8C" is two surrogate pairs encoding U+29F7E U+29F8C (𩽾𩾌):
// four chars in the String, but only two characters of text.
System.out.println(PinyinDictionary.convertToPinyin("\uD867\uDF7E\uD867\uDF8C"));
15+
// Same input through the HanLP facade method.
System.out.println(HanLP.convertToPinyinList("\uD867\uDF7E\uD867\uDF8C"));
16+
}
17+
}

0 commit comments

Comments
 (0)