当前位置: 首页>>代码示例>>Java>>正文


Java Term类代码示例

本文整理汇总了Java中com.hankcs.hanlp.seg.common.Term的典型用法代码示例。如果您正苦于以下问题:Java Term类的具体用法?Java Term怎么用?Java Term使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


Term类属于com.hankcs.hanlp.seg.common包,在下文中一共展示了Term类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: extractNameEntity

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * Extracts named entities from the given text using HanLP segmentation.
 *
 * @param content raw text to analyze
 * @return a map keyed by nature prefix — "nr" (person names), "ns" (place names),
 *         "nt" (organization names) — each mapped to the distinct matching words
 */
public Map<String, Set<String>> extractNameEntity(String content) {
	// Segment the text into terms once; all three entity scans reuse the result
	List<Term> list = SEGMENT.seg(content);
	Map<String, Set<String>> namedEntity = new HashMap<>();
	namedEntity.put("nr", wordsWithNaturePrefix(list, "nr"));
	namedEntity.put("ns", wordsWithNaturePrefix(list, "ns"));
	namedEntity.put("nt", wordsWithNaturePrefix(list, "nt"));
	return namedEntity;
}

/** Collects the distinct words whose part-of-speech nature starts with the given prefix. */
private static Set<String> wordsWithNaturePrefix(List<Term> terms, String prefix) {
	return terms.stream()
			.filter(term -> term.nature.startsWith(prefix))
			.map(term -> term.word)
			.collect(Collectors.toSet());
}
 
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:20,代码来源:HanlpExtractor.java

示例2: testSegmentCorpus

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * Segments every file of the Sogou mini text-classification corpus, removes stop words,
 * and writes each result to a flat "segmented" directory as space-separated words.
 * Note: paths are hard-coded to a local Windows layout.
 */
public void testSegmentCorpus() throws Exception
{
    File root = new File("D:\\Doc\\语料库\\搜狗文本分类语料库精简版");
    File[] folders = root.listFiles();
    // listFiles() returns null when the path does not exist or is not a directory
    if (folders == null) return;
    for (File folder : folders)
    {
        if (folder.isDirectory())
        {
            File[] files = folder.listFiles();
            if (files == null) continue;
            for (File file : files)
            {
                System.out.println(file.getAbsolutePath());
                List<Term> termList = HanLP.segment(IOUtil.readTxt(file.getAbsolutePath()));
                StringBuilder sbOut = new StringBuilder();
                for (Term term : termList)
                {
                    // Keep only notional words; stop words are dropped
                    if (CoreStopWordDictionary.shouldInclude(term))
                    {
                        sbOut.append(term.word).append(" ");
                    }
                }
                // Flatten category/file into one output name: <category>_<file>
                IOUtil.saveTxt("D:\\Doc\\语料库\\segmented\\" + folder.getName() + "_" + file.getName(), sbOut.toString());
            }
        }
    }
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:25,代码来源:TestLDA.java

示例3: convert

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * 将分词结果转换为同义词列表
 * @param sentence 句子
 * @param withUndefinedItem 是否保留词典中没有的词语
 * @return
 */
/**
 * Converts a segmentation result into a list of synonym items.
 *
 * @param sentence          the segmented sentence
 * @param withUndefinedItem whether words missing from the dictionary should still be kept
 *                          (as placeholder "undefined" items)
 * @return synonym items for the sentence, in word order
 */
public static List<CommonSynonymDictionary.SynonymItem> convert(List<Term> sentence, boolean withUndefinedItem)
{
    List<CommonSynonymDictionary.SynonymItem> result = new ArrayList<CommonSynonymDictionary.SynonymItem>(sentence.size());
    for (Term term : sentence)
    {
        CommonSynonymDictionary.SynonymItem item = get(term.word);
        if (item != null)
        {
            result.add(item);
        }
        else if (withUndefinedItem)
        {
            // Out-of-dictionary word: keep a placeholder entry when requested
            result.add(CommonSynonymDictionary.SynonymItem.createUndefined(term.word));
        }
    }
    return result;
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:30,代码来源:CoreSynonymDictionary.java

示例4: rewrite

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * Rewrites the text by replacing each word with a randomly chosen equal-meaning synonym
 * when one exists; words without a usable synonym are copied through unchanged.
 *
 * @param text original text
 * @return rewritten text
 */
public String rewrite(String text)
{
    List<Term> terms = StandardTokenizer.segment(text.toCharArray());
    StringBuilder out = new StringBuilder((int) (text.length() * 1.2));
    String previousTag = Predefine.TAG_BIGIN;
    for (Term term : terms)
    {
        String replacement = term.word;
        SynonymItem synonymItem = get(term.word);
        if (synonymItem != null)
        {
            // Pick a synonym of equal meaning, conditioned on the previous tagged word
            Synonym synonym = synonymItem.randomSynonym(Type.EQUAL, previousTag);
            if (synonym != null) replacement = synonym.realWord;
        }
        out.append(replacement);
        previousTag = PosTagCompiler.compile(term.nature.toString(), term.word);
    }
    return out.toString();
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:19,代码来源:CommonSynonymDictionary.java

示例5: extractSuffixByWords

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * 此方法认为后缀一定是整个的词语,所以length是以词语为单位的
 * @param length
 * @param size
 * @param extend
 * @return
 */
/**
 * Extracts frequent suffixes, treating whole words (not characters) as units —
 * so {@code length} counts words.
 *
 * @param length suffix length, in words
 * @param size   maximum number of suffixes to return
 * @param extend whether to also record every shorter suffix (1 .. length-1 words)
 * @return the extracted suffix strings
 */
public List<String> extractSuffixByWords(int length, int size, boolean extend)
{
    TFDictionary suffixFrequency = new TFDictionary();
    for (String key : tfDictionary.keySet())
    {
        List<Term> termList = StandardTokenizer.segment(key);
        int wordCount = termList.size();
        // Only entries strictly longer than the suffix can contribute one
        if (wordCount <= length) continue;
        suffixFrequency.add(combine(termList.subList(wordCount - length, wordCount)));
        if (extend)
        {
            for (int l = 1; l < length; ++l)
            {
                suffixFrequency.add(combine(termList.subList(wordCount - l, wordCount)));
            }
        }
    }
    return extract(suffixFrequency, size);
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:29,代码来源:CommonSuffixExtractor.java

示例6: toVertexList

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * Converts a term list into a vertex list for lattice-based processing.
 * Known words get their dictionary attribute (and the term's nature is normalized to
 * the dictionary's primary nature); unknown words fall back to x (whitespace) or nz.
 *
 * @param termList    segmentation result
 * @param appendStart whether to prepend the begin-of-sentence vertex
 * @return the vertex list
 */
private static List<Vertex> toVertexList(List<Term> termList, boolean appendStart)
{
    ArrayList<Vertex> vertexList = new ArrayList<Vertex>(termList.size() + 1);
    if (appendStart)
    {
        vertexList.add(Vertex.B);
    }
    for (Term term : termList)
    {
        CoreDictionary.Attribute attribute = CoreDictionary.get(term.word);
        if (attribute != null)
        {
            // In-dictionary word: align the term's tag with the dictionary's primary nature
            term.nature = attribute.nature[0];
        }
        else if (term.word.trim().length() == 0)
        {
            attribute = new CoreDictionary.Attribute(Nature.x);  // whitespace-only token
        }
        else
        {
            attribute = new CoreDictionary.Attribute(Nature.nz); // out-of-vocabulary word
        }
        vertexList.add(new Vertex(term.word, attribute));
    }
    return vertexList;
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:20,代码来源:CRFSegment.java

示例7: convertSentenceListToDocument

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * 将句子列表转化为文档
 *
 * @param sentenceList
 * @return
 */
/**
 * Converts a list of sentences into a document: one word list per sentence,
 * with stop words removed.
 *
 * @param sentenceList sentences to segment
 * @return a list of word lists, parallel to the input sentences
 */
private static List<List<String>> convertSentenceListToDocument(List<String> sentenceList)
{
    List<List<String>> docs = new ArrayList<List<String>>(sentenceList.size());
    for (String sentence : sentenceList)
    {
        List<String> wordList = new LinkedList<String>();
        for (Term term : StandardTokenizer.segment(sentence.toCharArray()))
        {
            // Skip stop words; keep only notional words
            if (!CoreStopWordDictionary.shouldInclude(term)) continue;
            wordList.add(term.word);
        }
        docs.add(wordList);
    }
    return docs;
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:25,代码来源:TextRankSentence.java

示例8: main

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * Demonstrates Traditional Chinese segmentation on three sample texts,
 * printing each segmentation result.
 */
public static void main(String[] args)
{
    // Compile-time string concatenation keeps these identical to the original literals
    String[] samples = new String[]{
            "大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前"
                    + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有"
                    + "突出外表、百變髮型及正面的形象,以至自己"
                    + "品牌的男士香水等商品,及長期擔任運動品牌"
                    + "Adidas的代言人,因此對大眾傳播媒介和時尚界"
                    + "等方面都具很大的影響力,在足球圈外所獲得的"
                    + "認受程度可謂前所未見。",
            "(中央社記者黃巧雯台北20日電)外需不振,影響接單動能,經濟部今天公布7月外銷訂單金額362.9億美元,年減5%,"
                    + "連續4個月衰退,減幅較6月縮小。1040820\n",
            "中央社记者黄巧雯台北20日电"
    };
    for (String sample : samples)
    {
        List<Term> termList = TraditionalChineseTokenizer.segment(sample);
        System.out.println(termList);
    }
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:19,代码来源:DemoTraditionalChineseSegment.java

示例9: seg2sentence

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * 切分为句子形式
 *
 * @param text
 * @return
 */
/**
 * Segments the text into sentences, each sentence being a list of terms,
 * with stop words removed from every sentence in place.
 *
 * @param text text to segment
 * @return the sentence list (same list instances returned by the segmenter, filtered)
 */
public static List<List<Term>> seg2sentence(String text)
{
    List<List<Term>> sentenceList = SEGMENT.seg2sentence(text);
    for (List<Term> sentence : sentenceList)
    {
        // removeIf replaces the manual ListIterator loop — same in-place filtering
        sentence.removeIf(term -> !CoreStopWordDictionary.shouldInclude(term));
    }
    return sentenceList;
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:24,代码来源:NotionalTokenizer.java

示例10: segSentence

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
// Segments a sentence and maps each term back to Traditional Chinese:
// the text is first normalized via CharTable for segmentation, then each term is
// either converted through the simplified->traditional dictionary or restored
// from the original input by offset, so the output words match the source script.
private static List<Term> segSentence(String text)
{
    // Normalized (converted) copy used only for segmentation
    String sText = CharTable.convert(text);
    List<Term> termList = SEGMENT.seg(sText);
    int offset = 0;
    for (Term term : termList)
    {
        String tText;
        term.offset = offset;
        // Single-char terms, or terms with no traditional-dictionary entry,
        // are restored verbatim from the ORIGINAL text at the current offset.
        // Note the assignment inside the condition: tText is only consulted
        // (and only computed) when term.length() > 1.
        if (term.length() == 1 || (tText = SimplifiedChineseDictionary.getTraditionalChinese(term.word)) == null)
        {
            term.word = text.substring(offset, offset + term.length());
            offset += term.length();
        }
        else
        {
            // Advance the offset BEFORE overwriting the word, since term.length()
            // depends on the current (pre-replacement) word length.
            offset += term.length();
            term.word = tText;
        }
    }

    return termList;
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:24,代码来源:TraditionalChineseTokenizer.java

示例11: segment

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * 分词
 * @param text 文本
 * @return 分词结果
 */
/**
 * Segments text while keeping web URLs intact: each URL match becomes a single
 * term (nature xu), and the plain text between URLs is segmented normally.
 *
 * @param text text to segment
 * @return the combined term list
 */
public static List<Term> segment(String text)
{
    List<Term> result = new LinkedList<Term>();
    Matcher matcher = WEB_URL.matcher(text);
    int cursor = 0;
    while (matcher.find())
    {
        // Segment the plain text before this URL, then append the URL as one term
        result.addAll(SEGMENT.seg(text.substring(cursor, matcher.start())));
        result.add(new Term(matcher.group(), Nature.xu));
        cursor = matcher.end();
    }
    // Trailing text after the last URL (or the whole text when no URL matched)
    if (cursor < text.length())
    {
        result.addAll(SEGMENT.seg(text.substring(cursor)));
    }
    return result;
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:23,代码来源:URLTokenizer.java

示例12: main

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * Demonstrates Chinese person-name recognition on a set of tricky sentences,
 * printing each segmentation result.
 */
public static void main(String[] args)
{
    String[] sentences = new String[]{
            "签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。",
            "区长庄木弟新年致辞",
            "朱立伦:两岸都希望共创双赢 习朱历史会晤在即",
            "陕西首富吴一坚被带走 与令计划妻子有交集",
            "据美国之音电台网站4月28日报道,8岁的凯瑟琳·克罗尔(凤甫娟)和很多华裔美国小朋友一样,小小年纪就开始学小提琴了。她的妈妈是位虎妈么?",
            "凯瑟琳和露西(庐瑞媛),跟她们的哥哥们有一些不同。",
            "王国强、高峰、汪洋、张朝阳光着头、韩寒、小四",
            "张浩和胡健康复员回家了",
            "王总和小丽结婚了",
            "编剧邵钧林和稽道青说",
            "这里有关天培的有关事迹",
            "龚学平等领导说,邓颖超生前杜绝超生",
    };
    // Enable person-name recognition on a fresh segmenter
    Segment nameSegment = HanLP.newSegment().enableNameRecognize(true);
    for (String sentence : sentences)
    {
        System.out.println(nameSegment.seg(sentence));
    }
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:24,代码来源:DemoChineseNameRecognition.java

示例13: segment

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * 中文分词
 *
 * @param sentence 待分词文本
 * @return 分词结果列表
 */
/**
 * Segments Chinese text with HanLP and returns the word surface forms.
 *
 * @param sentence text to segment
 * @return segmented words, in order (a mutable list)
 */
public static List<String> segment(String sentence) {
    // Collect directly instead of building an empty list and addAll-ing a second one
    return HanLP.segment(sentence)
            .stream()
            .map(term -> term.word)
            .collect(Collectors.toList());
}
 
开发者ID:shibing624,项目名称:phrase-search,代码行数:18,代码来源:Tokenizer.java

示例14: testSeg

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * Smoke test: segments a short phrase with the Dijkstra-based segmenter
 * and prints the result.
 */
public void testSeg() throws Exception
{
    DijkstraSegment dijkstraSegment = new DijkstraSegment();
    List<Term> terms = dijkstraSegment.seg("商品与服务");
    System.out.println(terms);
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:8,代码来源:TestDijkstra.java

示例15: testIssue22

import com.hankcs.hanlp.seg.common.Term; //导入依赖的package包/类
/**
 * Regression test for issue 22: the nature assigned to "年" by the tokenizer
 * must match the core dictionary's primary nature, and number+quantifier
 * recognition should change how "三年" is segmented.
 */
public void testIssue22() throws Exception
{
    CoreDictionary.Attribute yearAttribute = CoreDictionary.get("年");
    System.out.println(yearAttribute);
    List<Term> terms = StandardTokenizer.segment("三年");
    System.out.println(terms);
    // The second term ("年") must carry the dictionary's primary nature
    assertEquals(yearAttribute.nature[0], terms.get(1).nature);
    System.out.println(StandardTokenizer.segment("三元"));
    // Re-run after enabling number+quantifier recognition to observe the difference
    StandardTokenizer.SEGMENT.enableNumberQuantifierRecognize(true);
    System.out.println(StandardTokenizer.segment("三年"));
}
 
开发者ID:priester,项目名称:hanlpStudy,代码行数:12,代码来源:TestSegment.java


注:本文中的com.hankcs.hanlp.seg.common.Term类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。