本文整理汇总了Java中org.ansj.splitWord.analysis.ToAnalysis类的典型用法代码示例。如果您正苦于以下问题:Java ToAnalysis类的具体用法?Java ToAnalysis怎么用?Java ToAnalysis使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
ToAnalysis类属于org.ansj.splitWord.analysis包,在下文中一共展示了ToAnalysis类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: main
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Demo entry point: segments one sample sentence with {@code ToAnalysis} and prints it,
 * runs {@code IndexAnalysis} on a second sentence (result not printed), then streams a
 * third string through an {@code AnsjTokenizer} and prints every token followed by a space.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the tokenizer fails while resetting, reading or closing
 */
public static void main(String[] args) throws IOException {
    List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
    System.out.println(parse);

    // The index-mode result is not printed; the call itself is kept as in the original demo.
    List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");

    String text11 = "ZW321282050000000325";
    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
    try {
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        // Offset and position attributes are registered but not printed by this demo.
        OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAtt =
                tokenizer.addAttribute(PositionIncrementAttribute.class);

        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.print(termAtt.toString() + " ");
        }
    } finally {
        // Release the tokenizer even when reset()/incrementToken() throws.
        tokenizer.close();
    }
}
示例2: checkTextContent
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Checks whether the given content contains a sensitive word.
 *
 * <p>The word list is read as UTF-8 from {@code source}, one entry per line, with the
 * last character of every line dropped before it is stored. The content is stripped of
 * all HTML via Jsoup, segmented with {@code ToAnalysis}, and each term name is looked up
 * in the word set.
 *
 * @param userId  id of the user who submitted the content (not used by this method)
 * @param content raw, possibly HTML, text to check
 * @return {@code 0} if any segmented term is in the sensitive-word set, {@code 1} otherwise
 * @throws IOException if the word list cannot be opened or read
 */
public int checkTextContent(int userId, String content) throws IOException {
    HashSet<String> sensitiveWords = new HashSet<String>();
    BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(source), Charset.forName("UTF-8")));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            // An empty line has no trailing character to drop; substring(0, -1) would throw.
            if (line.isEmpty()) {
                continue;
            }
            sensitiveWords.add(line.substring(0, line.length() - 1));
        }
    } finally {
        // Closing the reader also closes the underlying file stream.
        br.close();
    }

    Result result = ToAnalysis.parse(Jsoup.clean(content, Whitelist.none()));
    List<Term> termList = result.getTerms();
    for (Term term : termList) {
        if (sensitiveWords.contains(term.getName())) {
            return 0;
        }
    }
    return 1;
}
示例3: doPost
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Handles a POST request by segmenting the {@code text} request parameter with
 * {@code ToAnalysis} and writing the resulting term list to the response body.
 * Request and response are both set to UTF-8.
 *
 * @param request  incoming servlet request carrying the {@code text} parameter
 * @param response servlet response that receives the segmentation result
 * @throws ServletException declared by the servlet contract
 * @throws IOException      if setting the encoding or writing the response fails
 */
public void doPost(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    request.setCharacterEncoding("UTF-8");
    response.setCharacterEncoding("UTF-8");

    // Required parameter.
    final String input = request.getParameter("text");
    final List<Term> segmented = ToAnalysis.parse(input);

    final PrintWriter writer = response.getWriter();
    writer.print("分词结果为:\n" + segmented);
    writer.flush();
    writer.close();
}
示例4: tokenizeDocxFile
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Parses the file at {@code filePath}; when it yields a {@code WordDocumentInfo}, segments
 * its text and appends every surviving token to {@code wordsCN}. A token is dropped when it
 * equals an entry of {@code stopTokens} or when its first character is an ASCII letter or
 * digit. For any other document type a message is printed and nothing is collected.
 *
 * @param filePath path of the document to tokenize
 */
private static void tokenizeDocxFile(String filePath) {
    DocumentInfo doc = DocumentParser.parseFileToDocumentInfo(new File(filePath));
    if (!(doc instanceof WordDocumentInfo)) {
        System.out.println("Not a docx file");
        return;
    }

    Result terms = ToAnalysis.parse(((WordDocumentInfo) doc).getDocStr());
    for (int idx = 0; idx < terms.size(); idx++) {
        String token = terms.get(idx).getName();

        boolean isStopToken = false;
        for (String stopToken : stopTokens) {
            if (token.equals(stopToken)) {
                isStopToken = true;
                break;
            }
        }

        // Evaluated for every token, stop tokens included (an empty token name throws here).
        char head = token.charAt(0);
        boolean upper = head >= 'A' && head <= 'Z';
        boolean lower = head >= 'a' && head <= 'z';
        boolean digit = head >= '0' && head <= '9';

        if (isStopToken || upper || lower || digit) {
            continue;
        }
        wordsCN.add(token);
    }
}
示例5: findEntities
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Extracts named entities from a sentence using the nature tag ansj assigns to each term.
 * Terms shorter than two characters are ignored. Tags starting with {@code nr} are added
 * as persons, {@code nt} as organizations, and {@code ns} as spaces, except that
 * {@code ns} terms ending in "大学" or "学院" are added as organizations.
 *
 * @param sentence        text to analyse
 * @param allowDuplicated passed through to the {@code Entities} constructor
 * @return the collected entities
 */
@Override
public Entities findEntities(String sentence, boolean allowDuplicated) {
    final Entities found = new Entities(allowDuplicated);
    for (Term term : ToAnalysis.parse(sentence).getTerms()) {
        final String name = term.getName();
        if (name.length() < 2) {
            continue;
        }

        final String nature = term.getNatureStr();
        if (nature.startsWith("nr")) {
            found.addPerson(name);
        } else if (nature.startsWith("nt")) {
            found.addOrganization(name);
        } else if (nature.startsWith("ns")) {
            final boolean looksLikeSchool = name.endsWith("大学") || name.endsWith("学院");
            if (looksLikeSchool) {
                found.addOrganization(name);
            } else {
                found.addSpace(name);
            }
        }
    }
    return found;
}
示例6: main
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Demo entry point: segments a sample sentence with {@code ToAnalysis} and prints it, then
 * streams the same sentence through an {@code AnsjTokenizer}, printing for every token its
 * text, {@code startOffset-endOffset-} and {@code positionIncrement/}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the tokenizer fails while resetting, reading or closing
 */
public static void main(String[] args) throws IOException {
    List<Term> parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》");
    System.out.println(parse);

    Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true);
    try {
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAtt =
                tokenizer.addAttribute(PositionIncrementAttribute.class);

        // The TokenStream consumer workflow requires reset() before the first incrementToken().
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.print(termAtt.toString());
            System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
            System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
        }
    } finally {
        // Release the tokenizer even when reset()/incrementToken() throws.
        tokenizer.close();
    }
}
示例7: testAnsjAnalyzer
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Prints the tokens produced for one sample sentence by four {@code AnsjAnalyzer}
 * configurations: the default constructor, the default with {@code stopwords}, the
 * {@code ToAnalysis}-based one, and the {@code ToAnalysis}-based one with {@code stopwords}.
 * Output only; the test makes no assertions.
 *
 * @throws IOException propagated from {@code displayTokensWithFullDetails}
 */
@Test
public void testAnsjAnalyzer() throws IOException {
    final String input = "我在首都机场虹桥路滑旱冰!玩的很Happy,很Hi!";
    final Version luceneVersion = Version.LUCENE_36;

    // Analyzer built without an explicit analysis class.
    System.out.print("Ansj索引时采用面向索引的分词:" + input);
    displayTokensWithFullDetails(new AnsjAnalyzer(luceneVersion), input);
    System.out.print("索引分词时过滤停用词");
    displayTokensWithFullDetails(new AnsjAnalyzer(luceneVersion, null, stopwords), input);

    System.out.println("------------------------------------------------------------");

    // Analyzer built on top of ToAnalysis.
    System.out.print("Ansj查询时采用精准分词:" + input);
    displayTokensWithFullDetails(new AnsjAnalyzer(luceneVersion, ToAnalysis.class), input);
    System.out.print("查询分词时过滤停用词");
    displayTokensWithFullDetails(new AnsjAnalyzer(luceneVersion, ToAnalysis.class, stopwords), input);
}
示例8: tag
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments a sentence and pairs every term with its nature string.
 *
 * @param sentence text to segment
 * @return one {@code SegWord} per term, in segmentation order
 * @throws SegmentException declared by the overridden method; not thrown directly here
 */
@Override
public List<SegWord> tag(String sentence) throws SegmentException {
    final List<Term> terms = ToAnalysis.parse(sentence).getTerms();
    final List<SegWord> tagged = new ArrayList<SegWord>();
    for (int i = 0; i < terms.size(); i++) {
        final Term current = terms.get(i);
        tagged.add(new SegWord(current.getName(), current.getNatureStr()));
    }
    return tagged;
}
示例9: demo
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments the given text with {@code ToAnalysis} and returns the term names.
 *
 * @param strToParse text to segment
 * @return the name of every term, in segmentation order
 */
public static ArrayList<String> demo(String strToParse) {
    ArrayList<String> ret = new ArrayList<>();
    Result terms = ToAnalysis.parse(strToParse);
    for (int i = 0; i < terms.size(); i++) {
        // Only the surface form is collected; the nature tag is not needed here.
        ret.add(terms.get(i).getName());
    }
    return ret;
}
示例10: Seg
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments a sentence, applying a stop-word filter made of punctuation and
 * whitespace tokens.
 *
 * @param sentence the sentence to segment
 * @return the segmentation result after the filter has been applied
 */
public static List<Term> Seg(String sentence) {
    // Filter out punctuation marks.
    FilterRecognition punctuationFilter = new FilterRecognition();
    punctuationFilter.insertStopWord(",", " ", ".", ",", "。", ":", ":", "'", "‘", "’", " ", "“", "”", "《", "》", "[", "]", "-");

    List<Term> filteredTerms = ToAnalysis.parse(sentence).recognition(punctuationFilter).getTerms();
    return filteredTerms;
}
示例11: realySplit
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments the text and keeps the names of those terms whose string form passes
 * {@code StringUtils.RegexUtils.hasChinese}.
 *
 * @param strbuf text to segment
 * @return names of the retained terms, in segmentation order
 */
private ArrayList<String> realySplit(String strbuf) {
    List<Term> segmented = ToAnalysis.parse(strbuf);
    ArrayList<String> chineseWords = new ArrayList<>();
    for (int i = 0; i < segmented.size(); i++) {
        Term candidate = segmented.get(i);
        // The check runs on toString(), while the stored value is the plain name.
        if (!StringUtils.RegexUtils.hasChinese(candidate.toString())) {
            continue;
        }
        chineseWords.add(candidate.getName());
    }
    return chineseWords;
}
示例12: containKeyword
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Reports whether {@code text} contains at least one of the tokens obtained by
 * segmenting {@code keyword}.
 *
 * @param text    text to search in
 * @param keyword keyword phrase that is segmented into tokens
 * @return {@code true} as soon as one token is found as a substring of {@code text},
 *         {@code false} if none is
 * @throws Exception declared for callers; not thrown directly by this method
 */
public static boolean containKeyword(String text, String keyword) throws Exception {
    List<Term> keywordTerms = ToAnalysis.parse(keyword);
    for (Term keywordTerm : keywordTerms) {
        if (text.contains(keywordTerm.getName())) {
            return true;
        }
    }
    return false;
}
示例13: segment
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments a sentence into its term names.
 *
 * @param sentence text to segment
 * @return the name of every term, in segmentation order
 * @throws SegmentException declared by the overridden method; not thrown directly here
 */
@Override
public List<String> segment(String sentence) throws SegmentException {
    final List<String> names = new ArrayList<String>();
    final List<Term> terms = ToAnalysis.parse(sentence).getTerms();
    for (int i = 0; i < terms.size(); i++) {
        names.add(terms.get(i).getName());
    }
    return names;
}
示例14: tokenizeTerm
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments the input and builds a map from term name to {@code TermScore} for every term
 * whose name has at least two characters. A name is stored with score 0 on first sight
 * and its score grows by one on each further occurrence. The {@code tokens} and
 * {@code token_iterator} members are overwritten as a side effect.
 *
 * @param input_str text to segment
 * @return map keyed by term name
 */
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
    tokens = ToAnalysis.parse(input_str);
    token_iterator = tokens.listIterator();

    final HashMap<String, TermScore> scores = new HashMap<String, TermScore>();
    while (token_iterator.hasNext()) {
        final String name = token_iterator.next().getName();
        if (name.length() < 2) {
            continue;
        }

        final TermScore known = scores.get(name);
        if (known == null) {
            // NOTE(review): first occurrence starts at 0, so score == occurrences - 1; confirm intended.
            scores.put(name, new TermScore(name, 0));
        } else {
            // The entry is already in the map; mutating it in place is sufficient.
            known.setScore(known.getScore() + 1);
        }
    }
    return scores;
}
示例15: guessNature
import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Guesses the nature (part of speech) of a word by rule.
 *
 * <p>The word is walked from its last character backwards through {@code SUFFIX_FOREST};
 * the nature of the last node seen with status 2 or 3 is remembered (status 3 also stops
 * the walk; presumably 2 means "match, may continue" and 3 "final match", to be confirmed
 * against {@code SmartForest}). The result is then chosen in this order:
 * <ol>
 *   <li>{@code NT} if the suffix nature is "nt" and more than one character matched or the
 *       word is longer than three characters;</li>
 *   <li>{@code NS} if the suffix nature is "ns";</li>
 *   <li>for words shorter than five characters: {@code NR} if segmenting the word yields a
 *       term tagged exactly "nr", otherwise {@code NW};</li>
 *   <li>for longer words: {@code NRF} if {@code ForeignPersonRecognition.isFName} accepts
 *       the word, otherwise {@code NW}.</li>
 * </ol>
 *
 * @param word the word whose nature is to be guessed
 * @return the guessed {@code TermNatures} constant
 */
public static TermNatures guessNature(String word) {
    String suffixNature = null;
    SmartForest<String[]> node = SUFFIX_FOREST;
    int matchedChars = 0;

    for (int pos = word.length() - 1; pos >= 0; pos--) {
        node = node.get(word.charAt(pos));
        if (node == null) {
            break;
        }
        matchedChars++;

        final int status = node.getStatus();
        if (status == 2 || status == 3) {
            suffixNature = node.getParam()[0];
            if (status == 3) {
                break;
            }
        }
    }

    if ("nt".equals(suffixNature) && (matchedChars > 1 || word.length() > 3)) {
        return TermNatures.NT;
    }
    if ("ns".equals(suffixNature)) {
        return TermNatures.NS;
    }
    if (word.length() < 5) {
        // Short words never reach the foreign-name check below.
        for (Term term : ToAnalysis.parse(word).getTerms()) {
            if ("nr".equals(term.getNatureStr())) {
                return TermNatures.NR;
            }
        }
        return TermNatures.NW;
    }
    if (ForeignPersonRecognition.isFName(word)) {
        return TermNatures.NRF;
    }
    return TermNatures.NW;
}