This article collects typical usage examples of the Java method org.apache.lucene.analysis.tokenattributes.CharTermAttribute.toString. If you are wondering what CharTermAttribute.toString does, how to call it, or want to see it used in real projects, the curated code samples below should help. You can also explore further usage examples of its declaring class, org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
The following presents 15 code examples of CharTermAttribute.toString, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
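Before the individual examples, it may help to see the pattern they all share: register a CharTermAttribute on a TokenStream, then read the current token's text with toString() after each successful incrementToken(). The minimal sketch below illustrates that pattern end to end; the class name CharTermAttributeDemo, the tokenize helper, and the use of StandardAnalyzer are purely illustrative and assume a recent Lucene version where StandardAnalyzer has a no-argument constructor.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CharTermAttributeDemo {
    public static List<String> tokenize(String text) throws IOException {
        List<String> terms = new ArrayList<>();
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("field", text)) {
            // The attribute instance is reused for every token; toString() snapshots the current one.
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                      // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                terms.add(termAtt.toString());   // copy out the token text
            }
            stream.end();                        // finish consuming the stream
        }
        return terms;
    }
}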
Example 1: analyze
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {
        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
        }
    }));
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", text, 1.0f);
    TokenStream stream = AllTokenStream.allTokenStream("_all", text, 1.0f, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}
Example 2: stemHinglish
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
public static void stemHinglish(CharTermAttribute termAtt)
{
    char[] buffer = termAtt.buffer();
    String strInput = termAtt.toString();
    //System.out.println("Before " + strInput + " " + termAtt.toString());
    Iterator itr = lsRegexs.iterator();
    while (itr.hasNext())
    {
        List<Object> lsInputs = (List<Object>) itr.next();
        Matcher matcher = ((Pattern) lsInputs.get(0)).matcher(strInput);
        if (matcher.matches())
        {
            Matcher replMatcher = ((Pattern) lsInputs.get(1)).matcher(strInput);
            strInput = replMatcher.replaceAll((String) lsInputs.get(2));
        }
    }
    //strInput = strInput.trim();
    // Copy the rewritten term back into the attribute's existing buffer
    // (assumes the replacement is never longer than the original token).
    for (int iCounter = 0; iCounter < strInput.length(); iCounter++)
    {
        buffer[iCounter] = strInput.charAt(iCounter);
    }
    termAtt.setLength(strInput.length());
    //System.out.println("After " + strInput + " " + termAtt.toString());
}
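Writing directly into termAtt.buffer() as above is only safe while the rewritten string is no longer than the original token. A more defensive variant (an illustrative alternative, not part of the original project; the name stemHinglishSafe is hypothetical) can let the attribute manage its own storage:

// Hypothetical variant: let CharTermAttribute copy the characters and handle buffer growth itself.
public static void stemHinglishSafe(CharTermAttribute termAtt) {
    String strInput = termAtt.toString();
    // ... apply the same regex replacements as in stemHinglish ...
    termAtt.setEmpty();        // reset the term length to 0
    termAtt.append(strInput);  // copies the characters and updates the length
}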
Example 3: removeStopWords
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
private String removeStopWords(String action) {
    StringBuilder builder = new StringBuilder();
    try {
        FrenchAnalyzer frenchAnalyzer = new FrenchAnalyzer();
        TokenStream tokenStream = frenchAnalyzer.tokenStream("contents", action);
        CharTermAttribute attribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = attribute.toString();
            builder.append(term + " ");
        }
        // release the stream and analyzer so they do not leak
        tokenStream.end();
        tokenStream.close();
        frenchAnalyzer.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return builder.toString();
}
Example 4: parseTokens
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
/**
 * Parses the query. Using this instead of a QueryParser in order
 * to avoid thread-safety issues with Lucene's query parser.
 *
 * @param fieldName the name of the field
 * @param value the value of the field
 * @return the parsed query
 */
private Query parseTokens(String fieldName, String value) {
    BooleanQuery searchQuery = new BooleanQuery();
    if (value != null) {
        Analyzer analyzer = new KeywordAnalyzer();
        try {
            TokenStream tokenStream =
                analyzer.tokenStream(fieldName, new StringReader(value));
            tokenStream.reset();
            CharTermAttribute attr =
                tokenStream.getAttribute(CharTermAttribute.class);
            while (tokenStream.incrementToken()) {
                String term = attr.toString();
                Query termQuery = new TermQuery(new Term(fieldName, term));
                searchQuery.add(termQuery, Occur.SHOULD);
            }
        } catch (IOException e) {
            throw new DukeException("Error parsing input string '" + value + "' " +
                                    "in field " + fieldName);
        }
    }
    return searchQuery;
}
Example 5: testBulk
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
@Test
public void testBulk() throws IOException {
    String str = "";
    str = "SK, 하이닉스";
    //str = "하이닉스";
    StringReader input = new StringReader(str);
    CSVAnalyzer analyzer = new CSVAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("", input);
    tokenStream.reset();
    logger.debug("tokenStream:{}", tokenStream);
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    for (int inx = 0; tokenStream.incrementToken(); inx++) {
        String term = charTermAttribute.toString();
        logger.debug("[{}] \"{}\" {}~{}", inx, term, offsetAttribute.startOffset(), offsetAttribute.endOffset());
    }
    analyzer.close();
}
Example 6: displayTokens
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
protected void displayTokens(String text, String elementId) throws IOException {
    if (log.isDebugEnabled()) {
        Analyzer analyzer = getConfiguredAnalyzer();
        StringBuilder sb = new StringBuilder();
        sb.append(elementId).append(": ").append(text).append(": ");
        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
        }
        log.debug(sb);
    }
}
Example 7: getTermFreq
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String term = cattr.toString();
            // look up and store under the same composed key so the counts actually accumulate
            String key = FeatureExtractorUtilities.composeKey(termField, term);
            int cnt = termFreq.getOrDefault(key, 0);
            termFreq.put(key, cnt + 1);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}
Example 8: walkTerms
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
void walkTerms(TokenStream ts, String op, String[] terms, String[] tags) throws IOException {
    int i = 0;
    while (ts.incrementToken()) {
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        String word = termAtt.toString();
        if (terms != null) {
            assertEquals(terms[i], word);
        }
        if (tags != null) {
            if (tags[i] != null) {
                PayloadAttribute p = ts.getAttribute(PayloadAttribute.class);
                BytesRef payload = p.getPayload();
                //Arrays.copyOfRange(payload.bytes, payload.offset, payload.offset + payload.length);
                byte[] data = payload.bytes;
                assertEquals(tags[i], (data != null) ? new String(data, "UTF-8") : null);
            }
        }
        i++;
    }
    if (terms != null) {
        assertEquals(terms.length, i);
    }
}
Example 9: analyze
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
public static ArrayList<String> analyze(final String content) {
    try {
        ArrayList<String> _xblockexpression = null;
        {
            final IKAnalyzer ikAnalyzer = new IKAnalyzer(true);
            final TokenStream ts = ikAnalyzer.tokenStream("field", content);
            final CharTermAttribute ch = ts.<CharTermAttribute>addAttribute(CharTermAttribute.class);
            ts.reset();
            final ArrayList<String> words = CollectionLiterals.<String>newArrayList();
            while (ts.incrementToken()) {
                String _string = ch.toString();
                words.add(_string);
            }
            ts.end();
            ts.close();
            _xblockexpression = words;
        }
        return _xblockexpression;
    } catch (Throwable _e) {
        throw Exceptions.sneakyThrow(_e);
    }
}
Example 10: main
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
public static void main(final String[] args) {
    try {
        final IKAnalyzer ikAnalyzer = new IKAnalyzer(true);
        final String text = "lucene分析器使用分詞器和過濾器構成一個“管道”,文本在流經這個管道後成為可以進入索引的最小單位,因此,一個標準的分析器有兩個部分組成,一個是分詞器tokenizer,它用於將文本按照規則切分為一個個可以進入索引的最小單位。另外一個是TokenFilter,它主要作用是對切出來的詞進行進一步的處理(如去掉敏感詞、英文大小寫轉換、單複數處理)等。lucene中的Tokenstram方法首先創建一個tokenizer對象處理Reader對象中的流式文本,然後利用TokenFilter對輸出流進行過濾處理";
        final TokenStream ts = ikAnalyzer.tokenStream("field", text);
        final CharTermAttribute ch = ts.<CharTermAttribute>addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String _string = ch.toString();
            String _plus = (_string + " | ");
            InputOutput.<String>print(_plus);
        }
        ts.end();
        ts.close();
    } catch (Throwable _e) {
        throw Exceptions.sneakyThrow(_e);
    }
}
Example 11: testShingleAnalyzerWrapperPhraseQuery
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
    PhraseQuery q = new PhraseQuery();
    TokenStream ts = analyzer.tokenStream("content", "this sentence");
    try {
        int j = -1;
        PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            j += posIncrAtt.getPositionIncrement();
            String termText = termAtt.toString();
            q.add(new Term("content", termText), j);
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
    ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
    int[] ranks = new int[] { 0 };
    compareRanks(hits, ranks);
}
Example 12: testShingleAnalyzerWrapperBooleanQuery
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
    BooleanQuery q = new BooleanQuery();
    TokenStream ts = analyzer.tokenStream("content", "test sentence");
    try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String termText = termAtt.toString();
            q.add(new TermQuery(new Term("content", termText)),
                  BooleanClause.Occur.SHOULD);
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
    ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
    int[] ranks = new int[] { 1, 2, 0 };
    compareRanks(hits, ranks);
}
Example 13: addTermFrequencies
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
/**
 * Adds term frequencies found by tokenizing text from the reader into termFreqMap
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without " +
            "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            if (isSkipTerm(fieldName, word)) {
                continue;
            }
            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    }
}
Example 14: incrementToken
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
@Override
public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        CharTermAttribute termAtt = this.getAttribute(CharTermAttribute.class);
        final String term = termAtt.toString();
        termAtt.setEmpty();
        PayloadAttribute payloadAtt = this.getAttribute(PayloadAttribute.class);
        final BytesRef payload = payloadAtt.getPayload();
        if (payload == null) {
            return true;
        }
        float payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
        if (payloadValue == 0.0f) {
            return true;
        }
        String weight = Float.toString(payloadValue);
        // set weights to zero if in scientific notation
        if (weight.contains("E-")) {
            return true;
        }
        String boostedTerm = term + "^" + weight;
        termAtt.append(boostedTerm);
        return true;
    }
    return false;
}
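The net effect of this filter is that a token carrying a usable payload is rewritten as term^weight (for example, a token lucene whose decoded payload is 0.75 would come out as lucene^0.75), which downstream query-building code can interpret as a boost. Note that tokens whose payload is missing, zero, or small enough to print in scientific notation are passed through with their text cleared, since setEmpty() is only undone in the boosting branch.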
Example 15: createDocVector
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; // import the package/class the method depends on
/**
 * Used to create a DocVector from the given String text. Used during the parse stage of the crawl
 * cycle to create a DocVector of the currently parsed page from the parseText attribute value
 * @param content the text to tokenize
 */
public static DocVector createDocVector(String content) {
    LuceneTokenizer tokenizer;
    if (stopWords != null) {
        tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, stopWords, true,
            StemFilterType.PORTERSTEM_FILTER);
    } else {
        tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, true,
            StemFilterType.PORTERSTEM_FILTER);
    }
    TokenStream tStream = tokenizer.getTokenStream();
    HashMap<String, Integer> termVector = new HashMap<>();
    try {
        CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
        tStream.reset();
        while (tStream.incrementToken()) {
            String term = charTermAttribute.toString();
            if (termVector.containsKey(term)) {
                int count = termVector.get(term);
                count++;
                termVector.put(term, count);
            } else {
                termVector.put(term, 1);
            }
        }
        DocVector docVector = new DocVector();
        docVector.setTermFreqVector(termVector);
        return docVector;
    } catch (IOException e) {
        LOG.error("Error creating DocVector : {}", StringUtils.stringifyException(e));
    }
    return null;
}