当前位置: 首页>>代码示例>>Java>>正文


Java ToAnalysis类代码示例

本文整理汇总了Java中org.ansj.splitWord.analysis.ToAnalysis的典型用法代码示例。如果您正苦于以下问题:Java ToAnalysis类的具体用法?Java ToAnalysis怎么用?Java ToAnalysis使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


ToAnalysis类属于org.ansj.splitWord.analysis包,在下文中一共展示了ToAnalysis类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: main

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Demo entry point: segments a sample sentence with {@code ToAnalysis} and prints it,
 * runs {@code IndexAnalysis} on a second sentence, then streams a sample identifier
 * through {@code AnsjTokenizer} and prints every term followed by a space.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the tokenizer fails while reading its input
 */
public static void main(String[] args) throws IOException {
	List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
	System.out.println(parse);
	// The original never used this result; the call itself is kept unchanged.
	IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");

	String text11 = "ZW321282050000000325";

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
	try {
		CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
		// Still registered, as in the original, although their values are not printed.
		tokenizer.addAttribute(OffsetAttribute.class);
		tokenizer.addAttribute(PositionIncrementAttribute.class);

		tokenizer.reset();
		while (tokenizer.incrementToken()) {
			System.out.print(termAtt.toString() + " ");
		}
	} finally {
		// Release the tokenizer even when reset()/incrementToken() throws.
		tokenizer.close();
	}
}
 
开发者ID:dimensoft,项目名称:improved-journey,代码行数:27,代码来源:TestAnsj.java

示例2: checkTextContent

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Checks whether {@code content} contains any word from the sensitive-word file.
 *
 * <p>The word list is read from {@code source} as UTF-8 on every call. The last
 * character of each non-empty line is dropped before the entry is stored. The content
 * is cleaned with {@code Jsoup.clean(content, Whitelist.none())}, segmented with
 * {@code ToAnalysis}, and each resulting term is looked up in the word set.
 *
 * @param userId  id of the user who submitted the content (not used by this method)
 * @param content the text to check
 * @return {@code 0} if any segmented term is a sensitive word, {@code 1} otherwise
 * @throws IOException if the sensitive-word file cannot be opened or read
 */
public int checkTextContent(int userId, String content) throws IOException {
    HashSet<String> sensitiveWords = new HashSet<String>();
    // try-with-resources closes the reader, and with it the wrapped streams, on every path.
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(source), Charset.forName("UTF-8")))) {
        String line;
        while ((line = br.readLine()) != null) {
            if (line.isEmpty()) {
                // substring(0, -1) would throw on a blank line; skip it instead.
                continue;
            }
            // Drop the last character of each entry, exactly as before
            // (presumably a trailing separator in the word list - TODO confirm).
            sensitiveWords.add(line.substring(0, line.length() - 1));
        }
    }

    Result result = ToAnalysis.parse(Jsoup.clean(content, Whitelist.none()));
    List<Term> termList = result.getTerms();
    for (Term term : termList) {
        if (sensitiveWords.contains(term.getName())) {
            return 0;
        }
    }
    return 1;
}
 
开发者ID:qinjr,项目名称:TeamNote,代码行数:19,代码来源:QualityUtilImpl.java

示例3: doPost

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Handles a POST request: segments the {@code text} request parameter with
 * {@code ToAnalysis} and writes the resulting term list to the response.
 *
 * @param request  the incoming request; its {@code text} parameter is the input
 * @param response the response the segmentation result is written to
 * @throws ServletException declared by the servlet contract
 * @throws IOException      if the response writer cannot be obtained or written
 */
public void doPost(HttpServletRequest request, HttpServletResponse response) 
		throws ServletException, IOException {
	final String encoding = "UTF-8";
	request.setCharacterEncoding(encoding);
	response.setCharacterEncoding(encoding);

	// Required parameter: the text to segment.
	final String text = request.getParameter("text");
	final List<Term> segmented = ToAnalysis.parse(text);

	final PrintWriter writer = response.getWriter();
	writer.print("分词结果为:\n" + segmented);
	writer.flush();
	writer.close();
}
 
开发者ID:landriesnidis,项目名称:NSIITA-SemanticMatching,代码行数:17,代码来源:Ansj_seg.java

示例4: tokenizeDocxFile

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Parses the file at {@code filePath}; if it yields a {@code WordDocumentInfo}, segments
 * its text and adds every term to {@code wordsCN} unless the term equals one of
 * {@code stopTokens} or starts with an ASCII letter or digit. For any other document
 * type, prints "Not a docx file".
 *
 * @param filePath path of the document to tokenize
 */
private static void tokenizeDocxFile(String filePath) {
    DocumentInfo parsed = DocumentParser.parseFileToDocumentInfo(new File(filePath));
    if (!(parsed instanceof WordDocumentInfo)) {
        System.out.println("Not a docx file");
        return;
    }

    Result segmented = ToAnalysis.parse(((WordDocumentInfo) parsed).getDocStr());
    for (int idx = 0; idx < segmented.size(); idx++) {
        String token = segmented.get(idx).getName();

        // Stop-token lookup runs first, matching the original evaluation order.
        boolean isStopToken = false;
        for (String stopToken : stopTokens) {
            if (token.equals(stopToken)) {
                isStopToken = true;
                break;
            }
        }

        char head = token.charAt(0);
        boolean startsWithAsciiAlnum = (head >= '0' && head <= '9')
                || (head >= 'a' && head <= 'z')
                || (head >= 'A' && head <= 'Z');

        if (!isStopToken && !startsWithAsciiAlnum) {
            wordsCN.add(token);
        }
    }
}
 
开发者ID:linzeqipku,项目名称:SnowGraph,代码行数:23,代码来源:WordSegmenter.java

示例5: findEntities

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments {@code sentence} and collects entities from the terms by nature prefix:
 * {@code nr} is added as a person, {@code nt} as an organization, and {@code ns} as a
 * space, except that {@code ns} terms ending in "大学" or "学院" are added as
 * organizations. Terms shorter than two characters are ignored.
 *
 * @param sentence        the text to analyse
 * @param allowDuplicated passed through to the {@code Entities} constructor
 * @return the collected entities
 */
@Override
public Entities findEntities(String sentence, boolean allowDuplicated) {
    Entities found = new Entities(allowDuplicated);

    Result segmented = ToAnalysis.parse(sentence);
    for (Term term : segmented.getTerms()) {
        String name = term.getName();
        if (name.length() < 2) {
            continue;
        }

        String nature = term.getNatureStr();
        if (nature.startsWith("nr")) {
            found.addPerson(name);
        } else if (nature.startsWith("nt")) {
            found.addOrganization(name);
        } else if (nature.startsWith("ns")) {
            boolean looksLikeSchool = name.endsWith("大学") || name.endsWith("学院");
            if (looksLikeSchool) {
                found.addOrganization(name);
            } else {
                found.addSpace(name);
            }
        }
    }
    return found;
}
 
开发者ID:iamxiatian,项目名称:wikit,代码行数:24,代码来源:AnsjSegment.java

示例6: main

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Demo entry point: segments a sample sentence with {@code ToAnalysis} and prints it,
 * then streams the same sentence through {@code AnsjTokenizer}, printing each term as
 * {@code term<start>-<end>-<positionIncrement>/}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the tokenizer fails while reading its input
 */
public static void main(String[] args) throws IOException {
	String text = "天天向上,媒体打打。《回家真好》";
	List<Term> parse = ToAnalysis.parse(text);
	System.out.println(parse);

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text), 0, true);
	try {
		CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
		OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
		PositionIncrementAttribute positionIncrementAtt =
				tokenizer.addAttribute(PositionIncrementAttribute.class);

		// The TokenStream contract requires reset() before the first incrementToken().
		tokenizer.reset();
		while (tokenizer.incrementToken()) {
			System.out.print(termAtt.toString());
			System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
			System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
		}
	} finally {
		// Release the tokenizer even when reset()/incrementToken() throws.
		tokenizer.close();
	}
}
 
开发者ID:lgnlgn,项目名称:ansj4solr,代码行数:21,代码来源:TestAnsj.java

示例7: testAnsjAnalyzer

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Prints the tokens produced for one sample sentence by four {@code AnsjAnalyzer}
 * configurations: the default constructor and the {@code ToAnalysis.class} variant,
 * each without and with the {@code stopwords} argument.
 *
 * @throws IOException propagated from {@code displayTokensWithFullDetails}
 */
@Test
public void testAnsjAnalyzer() throws IOException {
	final String sample = "我在首都机场虹桥路滑旱冰!玩的很Happy,很Hi!";

	// Default analyzer, without and then with stop words.
	System.out.print("Ansj索引时采用面向索引的分词:" + sample);
	AnsjAnalyzer indexAnalyzer = new AnsjAnalyzer(Version.LUCENE_36);
	displayTokensWithFullDetails(indexAnalyzer, sample);

	System.out.print("索引分词时过滤停用词");
	AnsjAnalyzer indexAnalyzerWithStopwords = new AnsjAnalyzer(Version.LUCENE_36,null,stopwords);
	displayTokensWithFullDetails(indexAnalyzerWithStopwords, sample);

	System.out.println("------------------------------------------------------------");

	// ToAnalysis-backed analyzer, without and then with stop words.
	System.out.print("Ansj查询时采用精准分词:" + sample);
	AnsjAnalyzer queryAnalyzer = new AnsjAnalyzer(Version.LUCENE_36,ToAnalysis.class);
	displayTokensWithFullDetails(queryAnalyzer, sample);

	System.out.print("查询分词时过滤停用词");
	AnsjAnalyzer queryAnalyzerWithStopwords = new AnsjAnalyzer(Version.LUCENE_36,ToAnalysis.class,stopwords);
	displayTokensWithFullDetails(queryAnalyzerWithStopwords, sample);
}
 
开发者ID:flash0729,项目名称:ansj-seg-for-lucene3,代码行数:17,代码来源:AnsjAnalysisTest.java

示例8: tag

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments {@code sentence} and returns one {@code SegWord} per term, built from the
 * term's name and nature string.
 *
 * @param sentence the text to segment and tag
 * @return the tagged words, in term order
 * @throws SegmentException declared by the overridden method
 */
@Override
public List<SegWord> tag(String sentence) throws SegmentException {
    List<Term> terms = ToAnalysis.parse(sentence).getTerms();

    List<SegWord> tagged = new ArrayList<SegWord>();
    for (int idx = 0; idx < terms.size(); idx++) {
        Term current = terms.get(idx);
        SegWord word = new SegWord(current.getName(), current.getNatureStr());
        tagged.add(word);
    }
    return tagged;
}
 
开发者ID:iamxiatian,项目名称:wikit,代码行数:12,代码来源:AnsjSegment.java

示例9: demo

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments {@code strToParse} with {@code ToAnalysis} and returns the name of every
 * resulting term, in order.
 *
 * @param strToParse the text to segment
 * @return the term names, one entry per term
 */
public static ArrayList<String> demo(String strToParse) {
    ArrayList<String> ret = new ArrayList<>();
    Result terms = ToAnalysis.parse(strToParse);
    for (int i = 0; i < terms.size(); i++) {
        // Only the word itself is collected; the nature string was never used.
        ret.add(terms.get(i).getName());
    }
    return ret;
}
 
开发者ID:linzeqipku,项目名称:SnowGraph,代码行数:15,代码来源:WordSegmenter.java

示例10: Seg

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments a sentence into terms, filtering out a fixed list of punctuation and
 * whitespace stop words.
 *
 * @param sentence the sentence to segment
 * @return the terms left after the stop-word filter has been applied
 */
public static List<Term> Seg(String sentence) {
    // Stop words to filter out: punctuation marks.
    FilterRecognition punctuationFilter = new FilterRecognition();
    punctuationFilter.insertStopWord(",", " ", ".", ",", "。", ":", ":", "'", "‘", "’", " ", "“", "”", "《", "》", "[", "]", "-");

    List<Term> remaining = ToAnalysis.parse(sentence).recognition(punctuationFilter).getTerms();
    return remaining;
}
 
开发者ID:jsksxs360,项目名称:Word2Vec,代码行数:12,代码来源:Segment.java

示例11: realySplit

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments {@code strbuf} and returns the names of the terms whose string form
 * passes {@code StringUtils.RegexUtils.hasChinese}.
 *
 * @param strbuf the text to segment
 * @return names of the accepted terms, in term order
 */
private ArrayList<String> realySplit(String strbuf) {
    List<Term> segmented = ToAnalysis.parse(strbuf);
    ArrayList<String> chineseWords = new ArrayList<>();

    for (int idx = 0; idx < segmented.size(); idx++) {
        Term candidate = segmented.get(idx);
        if (!StringUtils.RegexUtils.hasChinese(candidate.toString())) {
            continue;
        }
        chineseWords.add(candidate.getName());
    }

    return chineseWords;
}
 
开发者ID:William-Hai,项目名称:CorpusSpider,代码行数:12,代码来源:AnjsSplitWordsUtils.java

示例12: containKeyword

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Reports whether {@code text} contains at least one of the terms obtained by
 * segmenting {@code keyword} with {@code ToAnalysis}.
 *
 * @param text    the text to search in
 * @param keyword the keyword phrase; it is segmented and each term is tried in turn
 * @return {@code true} as soon as one term occurs in {@code text}, otherwise {@code false}
 * @throws Exception declared by the original signature
 */
public static boolean containKeyword(String text,String keyword) throws Exception{
	List<Term> keywordTerms = ToAnalysis.parse(keyword);
	for (int idx = 0; idx < keywordTerms.size(); idx++) {
		String piece = keywordTerms.get(idx).getName();
		if (text.contains(piece)) {
			return true;
		}
	}
	return false;
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:13,代码来源:HtmlContentExtractor.java

示例13: segment

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments {@code sentence} with {@code ToAnalysis} and returns the term names.
 *
 * @param sentence the text to segment
 * @return the name of every term, in term order
 * @throws SegmentException declared by the overridden method
 */
@Override
public List<String> segment(String sentence) throws SegmentException {
    List<String> words = new ArrayList<String>();
    List<Term> terms = ToAnalysis.parse(sentence).getTerms();
    for (int idx = 0; idx < terms.size(); idx++) {
        words.add(terms.get(idx).getName());
    }
    return words;
}
 
开发者ID:iamxiatian,项目名称:wikit,代码行数:10,代码来源:AnsjSegment.java

示例14: tokenizeTerm

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments {@code input_str} and builds a map from term name to {@code TermScore} for
 * every term whose name has at least two characters.
 *
 * <p>A name is stored with score 0 the first time it is seen, and each later occurrence
 * adds 1 to that score. The fields {@code tokens} and {@code token_iterator} are
 * overwritten on every call.
 *
 * @param input_str the text to tokenize
 * @return map from term name to its {@code TermScore}
 */
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
	tokens = ToAnalysis.parse(input_str);
	token_iterator = tokens.listIterator();

	HashMap<String, TermScore> scores = new HashMap<String, TermScore>();
	while (token_iterator.hasNext()) {
		String name = token_iterator.next().getName();
		if (name.length() < 2) {
			continue;
		}

		TermScore known = scores.get(name);
		if (known == null) {
			// NOTE(review): first occurrence starts at 0, so the score is occurrences - 1;
			// confirm this is what callers expect.
			scores.put(name, new TermScore(name, 0));
		} else {
			// The entry is mutated in place, so it does not need to be put back.
			known.setScore(known.getScore() + 1);
		}
	}

	return scores;
}
 
开发者ID:LunarBaseEngin,项目名称:LunarBase,代码行数:30,代码来源:TokenizerForSearchEngine.java

示例15: guessNature

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Guesses the nature (part of speech) of a word by rule.
 *
 * <p>The word is matched against {@code SUFFIX_FOREST} from its last character
 * backwards. The nature found there selects {@code NT} or {@code NS}. Otherwise words
 * shorter than five characters are segmented and yield {@code NR} if any term has
 * nature "nr", and longer words yield {@code NRF} if
 * {@code ForeignPersonRecognition.isFName} accepts them. Everything else is {@code NW}.
 *
 * @param word the word whose nature is to be guessed
 * @return the guessed {@code TermNatures}; {@code TermNatures.NW} when no rule applies
 */
public static TermNatures guessNature(String word) {
    String suffixNature = null;
    int matchedChars = 0;
    SmartForest<String[]> node = SUFFIX_FOREST;

    // Walk the suffix forest from the end of the word towards its start.
    for (int pos = word.length() - 1; pos >= 0; pos--) {
        node = node.get(word.charAt(pos));
        if (node == null) {
            break;
        }
        matchedChars++;

        int status = node.getStatus();
        if (status == 2 || status == 3) {
            suffixNature = node.getParam()[0];
            // Status 3 ends the walk; status 2 records the nature and keeps going
            // (presumably 3 marks an entry with no longer continuation - TODO confirm).
            if (status == 3) {
                break;
            }
        }
    }

    if ("nt".equals(suffixNature) && (matchedChars > 1 || word.length() > 3)) {
        return TermNatures.NT;
    }
    if ("ns".equals(suffixNature)) {
        return TermNatures.NS;
    }
    if (word.length() < 5) {
        Result segmented = ToAnalysis.parse(word);
        for (Term term : segmented.getTerms()) {
            if ("nr".equals(term.getNatureStr())) {
                return TermNatures.NR;
            }
        }
        // Short words never reach the foreign-name check below.
        return TermNatures.NW;
    }
    if (ForeignPersonRecognition.isFName(word)) {
        return TermNatures.NRF;
    }

    return TermNatures.NW;
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:42,代码来源:NatureRecognition.java


注:本文中的org.ansj.splitWord.analysis.ToAnalysis类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。