

Java OffsetAttribute.startOffset Method Code Examples

This article collects typical usage examples of the Java method org.apache.lucene.analysis.tokenattributes.OffsetAttribute.startOffset. If you have been wondering what OffsetAttribute.startOffset does or how to use it, the curated examples below should help. You can also explore further usage examples of the containing class, org.apache.lucene.analysis.tokenattributes.OffsetAttribute.


Fifteen code examples of the OffsetAttribute.startOffset method are shown below, ordered by popularity.
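Before the project-specific examples, here is a minimal, self-contained sketch of the usual call pattern: register an OffsetAttribute on a TokenStream, reset the stream, then read startOffset()/endOffset() for each token. The field name "body" and StandardAnalyzer are placeholders chosen for illustration, not anything the examples below require.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class StartOffsetDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("body", new StringReader("Hello offset world"))) {
            // attributes must be registered before reset()
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // startOffset()/endOffset() are character positions in the original input
                System.out.println(termAtt + " -> [" + offsetAtt.startOffset()
                        + ", " + offsetAtt.endOffset() + ")");
            }
            stream.end(); // records the offset of the end of the stream
        }
    }
}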

Example 1: assertOffsets

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
static private void assertOffsets(String inputStr, TokenStream tokenStream, List<String> expected) {
    try {
        List<String> termList = new ArrayList<String>();
        // CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        while (tokenStream.incrementToken()) {
            int start = offsetAttr.startOffset();
            int end = offsetAttr.endOffset();
            termList.add(inputStr.substring(start, end));
        }
        System.out.println(String.join(" ", termList));
        assertThat(termList, is(expected));
    } catch (IOException e) {
        fail("TokenStream threw an IOException: " + e.getMessage());
    }
}
 
Author: BuddhistDigitalResourceCenter; Project: lucene-bo; Lines: 17; Source: TibetanAnalyzerTest.java

Example 2: displayTokens

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
protected void displayTokens(String text, String elementId) throws IOException {
	if (log.isDebugEnabled()) {
		Analyzer analyzer = getConfiguredAnalyzer();
		StringBuilder sb = new StringBuilder();
		sb.append(elementId).append(": ").append(text).append(": ");

		TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
		CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
		OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

		tokenStream.reset();
		while (tokenStream.incrementToken()) {
			int startOffset = offsetAttribute.startOffset();
			int endOffset = offsetAttribute.endOffset();
			String term = charTermAttribute.toString();
			sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
		}

		log.debug(sb);
	}
}
 
Author: TIBCOSoftware; Project: jasperreports; Lines: 22; Source: LuceneUtil.java

Example 3: walkTokens

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
private String[] walkTokens() throws IOException {
    List<String> wordList = new ArrayList<>();
    while (input.incrementToken()) {
        CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
        char[] buffer = textAtt.buffer();
        // uses the offset span as the term length; this assumes no token filter
        // has changed the term's length relative to the original text
        String word = new String(buffer, 0, offsetAtt.endOffset() - offsetAtt.startOffset());
        wordList.add(word);
        AttributeSource attrs = input.cloneAttributes();
        tokenAttrs.add(attrs);
    }
    return wordList.toArray(new String[0]);
}
 
Author: jprante; Project: elasticsearch-analysis-opennlp; Lines: 18; Source: OpenNLPTokenFilter.java

Example 4: handleTokenStream

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
private void handleTokenStream(Map<Integer, List<Token>> tokenPosMap, TokenStream tokenStream) throws IOException {
    tokenStream.reset();
    int pos = 0;

    CharTermAttribute charTermAttribute = getCharTermAttribute(tokenStream);
    OffsetAttribute offsetAttribute = getOffsetAttribute(tokenStream);
    TypeAttribute typeAttribute = getTypeAttribute(tokenStream);
    PositionIncrementAttribute positionIncrementAttribute = getPositionIncrementAttribute(tokenStream);

    while (tokenStream.incrementToken()) {
        if (null == charTermAttribute || null == offsetAttribute) {
            return; // note: this early return skips the tokenStream.close() below
        }
        Token token = new Token(charTermAttribute.buffer(), 0, charTermAttribute.length(),
                offsetAttribute.startOffset(), offsetAttribute.endOffset());
        if (null != typeAttribute) {
            token.setType(typeAttribute.type());
        }
        pos += null != positionIncrementAttribute ? positionIncrementAttribute.getPositionIncrement() : 1;
        if (!tokenPosMap.containsKey(pos)) {
            tokenPosMap.put(pos, new LinkedList<Token>());
        }
        tokenPosMap.get(pos).add(token);
    }
    tokenStream.close();
}
 
Author: smalldirector; Project: solr-multilingual-analyzer; Lines: 27; Source: MultiLangTokenizer.java

Example 5: searchSingleWord

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
/**
 * Searches for a single word and updates the ranges.
 * 
 * @param tokenizer - The Lucene Tokenizer holding the complete text of the chapter.
 * @param searchString - The word to search for.
 * @param currentChapter - The chapter to search in.
 * @param ranges - The ranges of the found words will be added here.
 * @param documentLength - The length of the whole document.
 * @throws IOException
 */
private void searchSingleWord(Tokenizer tokenizer, String searchString, Chapter currentChapter,
    List<Range> ranges, int documentLength) throws IOException {
  // these attribute instances are updated in place on each incrementToken() call
  CharTermAttribute charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class);
  OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);

  tokenizer.reset();
  while (tokenizer.incrementToken()) {
    if (charTermAttrib.toString().toLowerCase().matches(searchString.toLowerCase())) {
      int startOffset = offset.startOffset() + currentChapter.getRange().getStart().getOffset();
      int endOffset = offset.endOffset() + currentChapter.getRange().getStart().getOffset();

      ranges.add(new Range(TextPosition.fromGlobalOffset(startOffset, documentLength),
          TextPosition.fromGlobalOffset(endOffset, documentLength)));
    }
  }
}
 
Author: vita-us; Project: ViTA; Lines: 28; Source: Searcher.java

Example 6: compare

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
@Override
public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {

  int lenA = offsetA.endOffset() - offsetA.startOffset();
  int lenB = offsetB.endOffset() - offsetB.startOffset();
  if (lenA < lenB) {
    return 1;
  } else if (lenA > lenB) {
    return -1;
    // by here, the length is the same
  } else if (offsetA.startOffset() < offsetB.startOffset()) {
    return -1;
  } else if (offsetA.startOffset() > offsetB.startOffset()) {
    return 1;
  }
  return 0;
}
 
Author: tballison; Project: lucene-addons; Lines: 18; Source: OffsetLengthStartComparator.java

Example 7: removeOverlapsAndSort

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
/**
 * @param offsets         offsets to process
 * @param comparator      initial OffsetLengthStartComparator to use to rule out overlaps
 * @param startComparator comparator for final sort
 * @return sorted list of offsets
 */
public static List<OffsetAttribute> removeOverlapsAndSort(
    List<OffsetAttribute> offsets, OffsetLengthStartComparator comparator,
    OffsetStartComparator startComparator) {
  if (offsets == null || offsets.size() < 2)
    return offsets;

  Collections.sort(offsets, comparator);
  Set<Integer> seen = new HashSet<>();
  List<OffsetAttribute> filtered = new ArrayList<>();
  for (OffsetAttribute offset : offsets) {
    if (!alreadySeen(offset, seen)) {
      filtered.add(offset);
      for (int i = offset.startOffset(); i < offset.endOffset(); i++) {
        seen.add(i);
      }
    }
  }
  if (startComparator != null) {
    Collections.sort(filtered, startComparator);
  }
  return filtered;
}
 
Author: tballison; Project: lucene-addons; Lines: 29; Source: OffsetUtil.java
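Taken together with the comparators from Example 6 (OffsetLengthStartComparator) and Example 12 further below (OffsetStartComparator), a call site might look like the sketch that follows. It assumes both comparators have no-arg constructors, which the snippets shown here neither confirm nor contradict.

// Hypothetical call site for OffsetUtil.removeOverlapsAndSort:
List<OffsetAttribute> kept = OffsetUtil.removeOverlapsAndSort(
        candidateOffsets,                  // offsets previously collected from a TokenStream
        new OffsetLengthStartComparator(), // longer spans win when two overlap
        new OffsetStartComparator());      // final ordering by start offset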

Example 8: tokenize

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
/**
 * Tokenizes a text using the filter passed as a parameter.
 *
 * @param text the text to tokenize
 * @param analyzer the analyzer to use
 * @param tokens the maximum number of tokens to keep (-1 for no limit)
 * @param filter the tokenization filter. This interface lets custom logic run
 * during tokenization: each time a token is produced, the filter receives it
 * and can, for example, concatenate the terms into a single string as they
 * arrive.
 * @throws Exception on failure
 */
public static void tokenize(String text, Analyzer analyzer, int tokens, TokenizerFilter filter) throws Exception {
    if (text == null) {
        return;
    }
    if (analyzer == null) {
        return;
    }
    text = text.toLowerCase();
    Matcher m = preplace.matcher(text);
    StringBuffer sb = new StringBuffer();
    while (m.find()) {
        m.appendReplacement(sb, " ");
    }
    m.appendTail(sb);
    TokenStream tokenStream = analyzer.tokenStream(BODY, new StringReader(sb.toString()));
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    int tokenNumber = 0;
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = charTermAttribute.toString();
        filter.applyTo(term);
        tokenNumber++;
        if (tokens != -1) {
            if (tokenNumber > tokens) {
                break;
            }
        }
    }
    tokenStream.close();
}
 
Author: fiohol; Project: theSemProject; Lines: 46; Source: Tokenizer.java
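Because the filter receives one term at a time, a caller can accumulate terms as in the sketch below. The shape of TokenizerFilter and its applyTo(String) method are inferred from the snippet above (only the call filter.applyTo(term) is visible), and StandardAnalyzer stands in for whatever analyzer the project actually uses, so treat this strictly as an illustration.

// Hypothetical usage; TokenizerFilter#applyTo(String) is inferred from the code above.
StringBuilder joined = new StringBuilder();
Tokenizer.tokenize("Some sample text.", new StandardAnalyzer(), -1, new TokenizerFilter() {
    @Override
    public void applyTo(String term) {
        joined.append(term).append(' ');
    }
});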

Example 9: createChainProximityQueryClauses

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
private void createChainProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    Term[] termArr = new Term[2];   // sliding window of the last two terms
    long[] offsetArr = new long[2]; // start offsets of the corresponding terms

    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if(termArr[0] == null) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else if(termArr[1] == null) {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        } else {
            // shift
            termArr[0] = termArr[1];
            offsetArr[0] = offsetArr[1];
            // fill
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        }
        
        if(termArr[0] != null && termArr[1] != null) {
            long offsetDiff = offsetArr[1] - offsetArr[0];
            if(offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();

                pq.setSlop((int) (offsetDiff) + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);

                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }
        }
    }
}
 
Author: iychoi; Project: biospectra; Lines: 40; Source: Classifier.java

Example 10: buildIterator

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
private Iterator<Token> buildIterator() throws IOException
{
    // TODO: use incrementToken() somehow?
    if(!done && source.incrementToken())
    {
        CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = null;
        if(source.hasAttribute(TypeAttribute.class))
        {
            typeAtt = source.getAttribute(TypeAttribute.class);
        }
        PositionIncrementAttribute posIncAtt = null;
        if(source.hasAttribute(PositionIncrementAttribute.class))
        {
            posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
        }
        Token token = new Token(cta.buffer(), 0, cta.length(), offsetAtt.startOffset(), offsetAtt.endOffset());
        if(typeAtt != null)
        {
            token.setType(typeAtt.type());
        }
        if(posIncAtt != null)
        {
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
        }
        return buildIterator(token);
    }
    else
    {
        done = true;
        return buildIterator(null);
    }
}
 
Author: Alfresco; Project: community-edition-old; Lines: 37; Source: MLTokenDuplicator.java

Example 11: alreadySeen

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
private static boolean alreadySeen(OffsetAttribute offset, Set<Integer> seen) {
  // note the inclusive upper bound: a span that ends exactly where an already
  // kept span begins is also treated as overlapping
  for (int i = offset.startOffset(); i <= offset.endOffset(); i++) {
    if (seen.contains(i))
      return true;
  }
  return false;
}
 
Author: tballison; Project: lucene-addons; Lines: 8; Source: OffsetUtil.java

Example 12: compare

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
@Override
public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {

  if (offsetA.startOffset() < offsetB.startOffset()) {
    return -1;
  } else if (offsetA.startOffset() > offsetB.startOffset()) {
    return 1;
  }
  return 0;
}
 
Author: tballison; Project: lucene-addons; Lines: 11; Source: OffsetStartComparator.java

Example 13: getCharOffsetRequests

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
/**
 * Simple utility method to build a TokenCharOffsetRequests object
 * from a list of desired tokenOffsets, the number of tokensBefore
 * and the number of tokensAfter.
 *
 * @param tokenOffsets the tokenOffsets that are desired
 * @param tokensBefore the number of tokens before a desired tokenOffset
 * @param tokensAfter  the number of tokens after a desired tokenOffset
 * @param requests     an empty requests object to be filled in
 */
public static void getCharOffsetRequests(
    List<OffsetAttribute> tokenOffsets,
    int tokensBefore, int tokensAfter,
    TokenCharOffsetRequests requests) {

  for (OffsetAttribute tokenOffset : tokenOffsets) {
    int start = tokenOffset.startOffset() - tokensBefore;
    start = (start < 0) ? 0 : start;
    int end = tokenOffset.endOffset() + tokensAfter + 1;
    for (int i = start; i < end; i++) {
      requests.add(i);
    }
  }
}
 
Author: tballison; Project: lucene-addons; Lines: 25; Source: ConcordanceSearcherUtil.java

Example 14: markUp

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
public TextFragment markUp(OffsetAttribute offsetAtt, Object termAtt,
		TokenGroup tokenGroup, String text, String[] tokenText,
		int[] startOffset, int[] endOffset, int[] lastEndOffset,
		StringBuilder newText,
		ArrayList<TextFragment> docFrags, TextFragment currentFrag, boolean isDistinct)
		throws InvalidTokenOffsetsException {
	
       logger.trace("text:{} / {}~{}", termAtt, startOffset[0], endOffset[0]);

       if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
           throw new InvalidTokenOffsetsException("Token " + termAtt.toString() + " exceeds length of provided text sized " + text.length() + " / for offset " + offsetAtt.startOffset() + "~" + offsetAtt.endOffset() );
       }

	logger.trace("numTokens:{} / distinct:{}", tokenGroup.numTokens, tokenGroup.isDistinct());
	if (tokenGroup.numTokens > 0) {
		// the current token is distinct from previous tokens -
		// markup the cached token group info
		startOffset[0] = tokenGroup.matchStartOffset;
		endOffset[0] = tokenGroup.matchEndOffset;
		tokenText[0] = text.substring(startOffset[0], endOffset[0]);
		
		String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText[0]), tokenGroup);
		
		logger.trace("text:{} / newText:{} / token:{} / markedUp:{} / startOffset:{} / lastEndOffset:{}", text, newText, tokenText, markedUpText, startOffset, lastEndOffset);

		if (startOffset[0] > lastEndOffset[0]) {
			newText.append(encoder.encodeText(text.substring(lastEndOffset[0], startOffset[0])));
			termSet.add(new TermSorted(encoder.encodeText(text.substring(lastEndOffset[0], startOffset[0])), null, lastEndOffset[0], startOffset[0]));
		}
		termSet.add(new TermSorted(tokenText[0], markedUpText, startOffset[0], endOffset[0]));

		logger.trace("TERMSET:{}", termSet);

		newText.append(markedUpText);
		lastEndOffset[0] = Math.max(endOffset[0], lastEndOffset[0]);
		
		logger.trace("newText:{}", newText);
		
		if(isDistinct) {
			tokenGroup.clear();
			// check if current token marks the start of a new fragment
			if (textFragmenter.isNewFragment()) {
				currentFrag.setScore(fragmentScorer.getFragmentScore());
				// record stats for a new fragment
				currentFrag.textEndPos = newText.length();
				currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
				fragmentScorer.startFragment(currentFrag);
				docFrags.add(currentFrag);
			}
		}
	}
	return currentFrag;
}
 
Author: gncloud; Project: fastcatsearch3; Lines: 54; Source: Highlighter.java

Example 15: testOutputComponentTypes

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class the method depends on
public void testOutputComponentTypes() throws IOException {
  String test = "The quick red fox jumped over the lazy brown dogs";

  TokenTypeSplitFilter ttsf = new TokenTypeSplitFilter(new Blah(whitespaceMockTokenizer(test)), Collections.singleton("even"),
      Collections.EMPTY_SET, "even_fork", "even_orig");
  TokenTypeSplitFilter ttsfOdd = new TokenTypeSplitFilter(ttsf, Collections.singleton("odd"),
      Collections.EMPTY_SET, "odd_fork", "odd_orig");
  TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(ttsfOdd, new String[] {"even_orig", "even_fork"}, "joined", null, "!", true, true);
  int count = 0;
  TypeAttribute typeAtt = ttjf.getAttribute(TypeAttribute.class);
  OffsetAttribute offsetAtt = ttjf.getAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncrAtt = ttjf.getAttribute(PositionIncrementAttribute.class);
  CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class);
  PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class);
  String lastTerm = null;
  int lastStartOffset = -1;
  int lastEndOffset = -1;
  ttjf.reset();
  while (ttjf.incrementToken()) {
    String term = termAtt.toString();
    String type = typeAtt.type();
    int startOffset = offsetAtt.startOffset();
    int endOffset = offsetAtt.endOffset();
    int posIncr = posIncrAtt.getPositionIncrement();
    BytesRef payload = payloadAtt.getPayload();
    switch (count % 5) {
      case 0:
        assertEquals("even_orig", type);
        assertEquals(1, posIncr);
        assertEquals(lastEndOffset + 1, startOffset);
        assertNull(payload);
        break;
      case 1:
        assertEquals("even_fork", type);
        assertEquals(lastTerm, term);
        assertEquals(0, posIncr);
        assertEquals(lastStartOffset, startOffset);
        assertEquals(lastEndOffset, endOffset);
        assertNull(payload);
        break;
      case 2:
        assertEquals("joined", type);
        assertEquals(0, posIncr);
        assertEquals(lastStartOffset, startOffset);
        String[] split = term.split("!");
        assertEquals(split[0], split[1]);
        assertNull(payload);
        break;
      case 3:
        assertEquals("odd_orig", type);
        assertEquals(1, posIncr);
        assertEquals(lastEndOffset + 1, startOffset);
        assertNull(payload);
        break;
      case 4:
        assertEquals("odd_fork", type);
        assertEquals(lastTerm, term);
        assertEquals(0, posIncr);
        assertEquals(lastStartOffset, startOffset);
        assertEquals(lastEndOffset, endOffset);
        assertNull(payload);
        break;
    }
    lastTerm = term;
    lastStartOffset = startOffset;
    lastEndOffset = endOffset;
    count++;
  }
  assertTrue(count + " does not equal: " + 25, count == 25);

}
 
Author: upenn-libraries; Project: solrplugins; Lines: 72; Source: TokenTypeJoinFilterTest.java


Note: The org.apache.lucene.analysis.tokenattributes.OffsetAttribute.startOffset examples above were collected from open-source projects hosted on GitHub and similar code and documentation platforms. The snippets were selected from projects contributed by the open-source community; copyright in each remains with its original authors, and any use or redistribution should follow the corresponding project's license. Please do not republish without permission.