|
| 1 | +/* |
| 2 | + * <author>Han He</author> |
| 3 | + * <email>me@hankcs.com</email> |
| 4 | + * <create-date>2018-07-29 4:18 PM</create-date> |
| 5 | + * |
| 6 | + * <copyright file="DemoCRFNERPlane.java"> |
| 7 | + * Copyright (c) 2018, Han He. All Rights Reserved, http://www.hankcs.com/ |
| 8 | + * This source is subject to Han He. Please contact Han He for more information. |
| 9 | + * </copyright> |
| 10 | + */ |
| 11 | +package com.hankcs.book.ch08; |
| 12 | + |
| 13 | +import com.hankcs.hanlp.corpus.io.IOUtil; |
| 14 | +import com.hankcs.hanlp.model.crf.CRFNERecognizer; |
| 15 | +import com.hankcs.hanlp.tokenizer.lexical.NERecognizer; |
| 16 | + |
| 17 | +import java.io.IOException; |
| 18 | + |
| 19 | +import static com.hankcs.book.ch08.DemoPlane.PLANE_CORPUS; |
| 20 | +import static com.hankcs.book.ch08.DemoPlane.PLANE_MODEL; |
| 21 | + |
| 22 | + |
| 23 | +/** |
| 24 | + * 《自然语言处理入门》8.6.2 训练领域模型 (书本之外的补充试验) |
| 25 | + * 配套书籍:http://nlp.hankcs.com/book.php |
| 26 | + * 讨论答疑:https://bbs.hankcs.com/ |
| 27 | + * |
| 28 | + * @author hankcs |
| 29 | + * @see <a href="http://nlp.hankcs.com/book.php">《自然语言处理入门》</a> |
| 30 | + * @see <a href="https://bbs.hankcs.com/">讨论答疑</a> |
| 31 | + */ |
| 32 | +public class DemoCRFNERPlane |
| 33 | +{ |
| 34 | + public static void main(String[] args) throws IOException |
| 35 | + { |
| 36 | + NERecognizer recognizer = train(PLANE_CORPUS, PLANE_MODEL); |
| 37 | + String[] wordArray = {"歼", "-", "7", "战斗机", "正是", "仿照", "米格", "-", "21", "而", "制", "。"}; // 构造单词序列 |
| 38 | + String[] posArray = {"v", "w", "w", "n", "d", "v", "nr", "w", "m", "c", "v", "w"}; // 构造词性序列 |
| 39 | + String[] nerTagArray = recognizer.recognize(wordArray, posArray); // 序列标注 |
| 40 | + for (int i = 0; i < wordArray.length; i++) |
| 41 | + System.out.printf("%-4s\t%s\t%s\t\n", wordArray[i], posArray[i], nerTagArray[i]); |
| 42 | + } |
| 43 | + |
| 44 | + public static NERecognizer train(String corpus, String model) throws IOException |
| 45 | + { |
| 46 | + if (IOUtil.isFileExisted(model + ".txt")) // 若存在CRF++训练结果,则直接加载 |
| 47 | + return new CRFNERecognizer(model + ".txt"); |
| 48 | + CRFNERecognizer recognizer = new CRFNERecognizer(null); // 空白 |
| 49 | + recognizer.tagSet.nerLabels.clear(); // 不识别nr、ns、nt |
| 50 | + recognizer.tagSet.nerLabels.add("np"); // 目标是识别np |
| 51 | + recognizer.train(corpus, model); |
| 52 | + recognizer = new CRFNERecognizer(model + ".txt"); |
| 53 | + return recognizer; |
| 54 | + } |
| 55 | +} |
0 commit comments