This article collects typical usage examples of the Java method org.apache.lucene.analysis.tokenattributes.OffsetAttribute.startOffset. If you are wondering what OffsetAttribute.startOffset does, how to call it, or how it is used in practice, the curated code examples below may help. You can also read further about the containing class org.apache.lucene.analysis.tokenattributes.OffsetAttribute.
Shown below are 15 code examples of OffsetAttribute.startOffset, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
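Before diving into the collected snippets, here is a minimal self-contained sketch of the pattern they all share: add an OffsetAttribute to a TokenStream, call reset(), then read startOffset()/endOffset() for every token. The StandardAnalyzer, the field name "body", and the sample text are illustrative assumptions, not taken from any of the examples below.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class StartOffsetDemo {
    public static void main(String[] args) throws IOException {
        String text = "Lucene start offset example";
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // startOffset()/endOffset() are character positions into the original text
                System.out.println(termAtt + " -> [" + offsetAtt.startOffset()
                        + ", " + offsetAtt.endOffset() + ")");
            }
            stream.end();
        }
    }
}

The real-world examples that follow apply this same pattern inside project-specific helpers such as test assertions, highlighters, and query builders.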
Example 1: assertOffsets
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
static private void assertOffsets(String inputStr, TokenStream tokenStream, List<String> expected) {
try {
List<String> termList = new ArrayList<String>();
// CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
while (tokenStream.incrementToken()) {
int start = offsetAttr.startOffset();
int end = offsetAttr.endOffset();
termList.add(inputStr.substring(start, end));
}
System.out.println(String.join(" ", termList));
assertThat(termList, is(expected));
} catch (IOException e) {
assertTrue(false);
}
}
Example 2: displayTokens
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
protected void displayTokens(String text, String elementId) throws IOException {
if (log.isDebugEnabled()) {
Analyzer analyzer = getConfiguredAnalyzer();
StringBuilder sb = new StringBuilder();
sb.append(elementId).append(": ").append(text).append(": ");
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
}
log.debug(sb);
}
}
Example 3: walkTokens
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
private String[] walkTokens() throws IOException {
List<String> wordList = new ArrayList<>();
while (input.incrementToken()) {
CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
char[] buffer = textAtt.buffer();
String word = new String(buffer, 0, offsetAtt.endOffset() - offsetAtt.startOffset());
wordList.add(word);
AttributeSource attrs = input.cloneAttributes();
tokenAttrs.add(attrs);
}
String[] words = new String[wordList.size()];
for (int i = 0; i < words.length; i++) {
words[i] = wordList.get(i);
}
return words;
}
Example 4: handleTokenStream
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
private void handleTokenStream(Map<Integer, List<Token>> tokenPosMap, TokenStream tokenStream) throws IOException {
tokenStream.reset();
int pos = 0;
CharTermAttribute charTermAttribute = getCharTermAttribute(tokenStream);
OffsetAttribute offsetAttribute = getOffsetAttribute(tokenStream);
TypeAttribute typeAttribute = getTypeAttribute(tokenStream);
PositionIncrementAttribute positionIncrementAttribute = getPositionIncrementAttribute(tokenStream);
while (tokenStream.incrementToken()) {
if (null == charTermAttribute || null == offsetAttribute) {
return;
}
Token token = new Token(charTermAttribute.buffer(), 0, charTermAttribute.length(),
offsetAttribute.startOffset(), offsetAttribute.endOffset());
if (null != typeAttribute) {
token.setType(typeAttribute.type());
}
pos += null != positionIncrementAttribute ? positionIncrementAttribute.getPositionIncrement() : 1;
if (!tokenPosMap.containsKey(pos)) {
tokenPosMap.put(pos, new LinkedList<Token>());
}
tokenPosMap.get(pos).add(token);
}
tokenStream.close();
}
Example 5: searchSingleWord
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
/**
* Searches for a single word and updates the ranges.
*
* @param tokenizer - The Lucene Tokenizer with the complete text of chapter.
* @param searchString - The word to search for.
* @param currentChapter - The chapter to search in.
* @param ranges - The ranges of the found words will be added here.
* @param documentLength - The length of the whole document.
* @throws IOException
*/
private void searchSingleWord(Tokenizer tokenizer, String searchString, Chapter currentChapter,
List<Range> ranges, int documentLength) throws IOException {
// will be incremented
CharTermAttribute charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class);
OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
tokenizer.reset();
while (tokenizer.incrementToken()) {
if (charTermAttrib.toString().toLowerCase().matches(searchString.toLowerCase())) {
int startOffset = offset.startOffset() + currentChapter.getRange().getStart().getOffset();
int endOffset = offset.endOffset() + currentChapter.getRange().getStart().getOffset();
ranges.add(new Range(TextPosition.fromGlobalOffset(startOffset, documentLength),
TextPosition.fromGlobalOffset(endOffset, documentLength)));
}
}
}
Example 6: compare
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
@Override
public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {
int lenA = offsetA.endOffset() - offsetA.startOffset();
int lenB = offsetB.endOffset() - offsetB.startOffset();
if (lenA < lenB) {
return 1;
} else if (lenA > lenB) {
return -1;
// by here, the length is the same
} else if (offsetA.startOffset() < offsetB.startOffset()) {
return -1;
} else if (offsetA.startOffset() > offsetB.startOffset()) {
return 1;
}
return 0;
}
Example 7: removeOverlapsAndSort
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
/**
* @param offsets offsets to process
* @param comparator initial OffsetLengthStartComparator to use to rule out overlaps
* @param startComparator comparator for final sort
* @return sorted list of offsets
*/
public static List<OffsetAttribute> removeOverlapsAndSort(
List<OffsetAttribute> offsets, OffsetLengthStartComparator comparator,
OffsetStartComparator startComparator) {
if (offsets == null || offsets.size() < 2)
return offsets;
Collections.sort(offsets, comparator);
Set<Integer> seen = new HashSet<>();
List<OffsetAttribute> filtered = new ArrayList<>();
for (OffsetAttribute offset : offsets) {
if (!alreadySeen(offset, seen)) {
filtered.add(offset);
for (int i = offset.startOffset(); i < offset.endOffset(); i++) {
seen.add(i);
}
}
}
if (startComparator != null) {
Collections.sort(filtered, startComparator);
}
return filtered;
}
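As a hedged usage sketch only: the snippet below wires the length-first comparator from Example 6 and the start-offset comparator shown later in Example 12 into removeOverlapsAndSort. The no-argument comparator constructors, the hosting class name OffsetUtil, and the use of Lucene's stock OffsetAttributeImpl to build test data are assumptions made for illustration, not details confirmed by this listing.

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;

public class RemoveOverlapsDemo {
    public static void main(String[] args) {
        List<OffsetAttribute> offsets = new ArrayList<>();
        offsets.add(newOffset(0, 10));  // longest span, kept first
        offsets.add(newOffset(2, 5));   // overlaps the span above, filtered out
        offsets.add(newOffset(12, 15)); // disjoint span, kept
        // hypothetical hosting class and no-arg constructors
        List<OffsetAttribute> kept = OffsetUtil.removeOverlapsAndSort(
                offsets, new OffsetLengthStartComparator(), new OffsetStartComparator());
        for (OffsetAttribute o : kept) {
            System.out.println(o.startOffset() + "-" + o.endOffset()); // 0-10, then 12-15
        }
    }

    private static OffsetAttribute newOffset(int start, int end) {
        OffsetAttributeImpl att = new OffsetAttributeImpl();
        att.setOffset(start, end);
        return att;
    }
}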
Example 8: tokenize
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
/**
* Tokenizes a text using the filter passed as a parameter
*
* @param text the text to tokenize
* @param analyzer the analyzer to use
* @param tokens the maximum number of tokens to keep
* @param filter the tokenization filter. This interface lets custom logic be applied
* during tokenization: each time a token is received the system applies that logic,
* for example concatenating the tokens into a single string as they arrive
* @throws Exception exception
*/
public static void tokenize(String text, Analyzer analyzer, int tokens, TokenizerFilter filter) throws Exception {
if (text == null) {
return;
}
if (analyzer == null) {
return;
}
text = text.toLowerCase();
Matcher m = preplace.matcher(text);
StringBuffer sb = new StringBuffer();
while (m.find()) {
m.appendReplacement(sb, " ");
}
m.appendTail(sb);
TokenStream tokenStream = analyzer.tokenStream(BODY, new StringReader(sb.toString()));
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
int tokenNumber = 0;
while (tokenStream.incrementToken()) {
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
filter.applyTo(term);
tokenNumber++;
if (tokens != -1) {
if (tokenNumber > tokens) {
break;
}
}
}
tokenStream.close();
}
Example 9: createChainProximityQueryClauses
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
private void createChainProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
Term termArr[] = new Term[2];
long offsetArr[] = new long[2];
for(int i=0;i<2;i++) {
termArr[i] = null;
offsetArr[i] = 0;
}
while (stream.incrementToken()) {
Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
if(termArr[0] == null) {
termArr[0] = t;
offsetArr[0] = offsetAtt.startOffset();
} else if(termArr[1] == null) {
termArr[1] = t;
offsetArr[1] = offsetAtt.startOffset();
} else {
// shift
termArr[0] = termArr[1];
offsetArr[0] = offsetArr[1];
// fill
termArr[1] = t;
offsetArr[1] = offsetAtt.startOffset();
}
if(termArr[0] != null && termArr[1] != null) {
long offsetDiff = offsetArr[1] - offsetArr[0];
if(offsetDiff > 0) {
PhraseQuery.Builder pq = new PhraseQuery.Builder();
pq.setSlop((int) (offsetDiff) + 1);
pq.add(termArr[0]);
pq.add(termArr[1]);
builder.add(pq.build(), BooleanClause.Occur.SHOULD);
}
}
}
}
Example 10: buildIterator
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
private Iterator<Token> buildIterator() throws IOException
{
// TODO: use incrementToken() somehow?
if(!done && source.incrementToken())
{
CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = null;
if(source.hasAttribute(TypeAttribute.class))
{
typeAtt = source.getAttribute(TypeAttribute.class);
}
PositionIncrementAttribute posIncAtt = null;
if(source.hasAttribute(PositionIncrementAttribute.class))
{
posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
}
Token token = new Token(cta.buffer(), 0, cta.length(), offsetAtt.startOffset(), offsetAtt.endOffset());
if(typeAtt != null)
{
token.setType(typeAtt.type());
}
if(posIncAtt != null)
{
token.setPositionIncrement(posIncAtt.getPositionIncrement());
}
return buildIterator(token);
}
else
{
done = true;
return buildIterator(null);
}
}
Example 11: alreadySeen
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
private static boolean alreadySeen(OffsetAttribute offset, Set<Integer> seen) {
for (int i = offset.startOffset(); i <= offset.endOffset(); i++) {
if (seen.contains(i))
return true;
}
return false;
}
Example 12: compare
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
@Override
public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {
if (offsetA.startOffset() < offsetB.startOffset()) {
return -1;
} else if (offsetA.startOffset() > offsetB.startOffset()) {
return 1;
}
return 0;
}
Example 13: getCharOffsetRequests
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
/**
* Simple utility method to build a TokenCharOffsetRequests object
* from a list of desired tokenOffsets, the number of tokensBefore
* and the number of tokensAfter.
*
* @param tokenOffsets the tokenOffsets that are desired
* @param tokensBefore the number of tokens before a desired tokenOffset
* @param tokensAfter the number of tokens after a desired tokenOffset
* @param requests an empty requests to be filled in
*/
public static void getCharOffsetRequests(
List<OffsetAttribute> tokenOffsets,
int tokensBefore, int tokensAfter,
TokenCharOffsetRequests requests) {
for (OffsetAttribute tokenOffset : tokenOffsets) {
int start = tokenOffset.startOffset() - tokensBefore;
start = (start < 0) ? 0 : start;
int end = tokenOffset.endOffset() + tokensAfter + 1;
for (int i = start; i < end; i++) {
requests.add(i);
}
}
}
Example 14: markUp
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
public TextFragment markUp(OffsetAttribute offsetAtt, Object termAtt,
TokenGroup tokenGroup, String text, String[] tokenText,
int[] startOffset, int[] endOffset, int[] lastEndOffset,
StringBuilder newText,
ArrayList<TextFragment> docFrags, TextFragment currentFrag, boolean isDistinct)
throws InvalidTokenOffsetsException {
logger.trace("text:{} / {}~{}", termAtt, startOffset[0], endOffset[0]);
if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
throw new InvalidTokenOffsetsException("Token " + termAtt.toString() + " exceeds length of provided text sized " + text.length() + " / for offset " + offsetAtt.startOffset() + "~" + offsetAtt.endOffset() );
}
logger.trace("numTokens:{} / distinct:{}", tokenGroup.numTokens, tokenGroup.isDistinct());
if (tokenGroup.numTokens > 0) {
// the current token is distinct from previous tokens -
// markup the cached token group info
startOffset[0] = tokenGroup.matchStartOffset;
endOffset[0] = tokenGroup.matchEndOffset;
tokenText[0] = text.substring(startOffset[0], endOffset[0]);
String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText[0]), tokenGroup);
logger.trace("text:{} / newText:{} / token:{} / markedUp:{} / startOffset:{} / lastEndOffset:{}", text, newText, tokenText, markedUpText, startOffset, lastEndOffset);
if (startOffset[0] > lastEndOffset[0]) {
newText.append(encoder.encodeText(text.substring(lastEndOffset[0], startOffset[0])));
termSet.add(new TermSorted(encoder.encodeText(text.substring(lastEndOffset[0], startOffset[0])), null, lastEndOffset[0], startOffset[0]));
}
termSet.add(new TermSorted(tokenText[0], markedUpText, startOffset[0], endOffset[0]));
logger.trace("TERMSET:{}", termSet);
newText.append(markedUpText);
lastEndOffset[0] = Math.max(endOffset[0], lastEndOffset[0]);
logger.trace("newText:{}", newText);
if(isDistinct) {
tokenGroup.clear();
// check if current token marks the start of a new fragment
if (textFragmenter.isNewFragment()) {
currentFrag.setScore(fragmentScorer.getFragmentScore());
// record stats for a new fragment
currentFrag.textEndPos = newText.length();
currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
}
}
return currentFrag;
}
Example 15: testOutputComponentTypes
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // import the package/class this method depends on
public void testOutputComponentTypes() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";
TokenTypeSplitFilter ttsf = new TokenTypeSplitFilter(new Blah(whitespaceMockTokenizer(test)), Collections.singleton("even"),
Collections.EMPTY_SET, "even_fork", "even_orig");
TokenTypeSplitFilter ttsfOdd = new TokenTypeSplitFilter(ttsf, Collections.singleton("odd"),
Collections.EMPTY_SET, "odd_fork", "odd_orig");
TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(ttsfOdd, new String[] {"even_orig", "even_fork"}, "joined", null, "!", true, true);
int count = 0;
TypeAttribute typeAtt = ttjf.getAttribute(TypeAttribute.class);
OffsetAttribute offsetAtt = ttjf.getAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncrAtt = ttjf.getAttribute(PositionIncrementAttribute.class);
CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class);
PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class);
String lastTerm = null;
int lastStartOffset = -1;
int lastEndOffset = -1;
ttjf.reset();
while (ttjf.incrementToken()) {
String term = termAtt.toString();
String type = typeAtt.type();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
int posIncr = posIncrAtt.getPositionIncrement();
BytesRef payload = payloadAtt.getPayload();
switch (count % 5) {
case 0:
assertEquals("even_orig", type);
assertEquals(1, posIncr);
assertEquals(lastEndOffset + 1, startOffset);
assertNull(payload);
break;
case 1:
assertEquals("even_fork", type);
assertEquals(lastTerm, term);
assertEquals(0, posIncr);
assertEquals(lastStartOffset, startOffset);
assertEquals(lastEndOffset, endOffset);
assertNull(payload);
break;
case 2:
assertEquals("joined", type);
assertEquals(0, posIncr);
assertEquals(lastStartOffset, startOffset);
String[] split = term.split("!");
assertEquals(split[0], split[1]);
assertNull(payload);
break;
case 3:
assertEquals("odd_orig", type);
assertEquals(1, posIncr);
assertEquals(lastEndOffset + 1, startOffset);
assertNull(payload);
break;
case 4:
assertEquals("odd_fork", type);
assertEquals(lastTerm, term);
assertEquals(0, posIncr);
assertEquals(lastStartOffset, startOffset);
assertEquals(lastEndOffset, endOffset);
assertNull(payload);
break;
}
lastTerm = term;
lastStartOffset = startOffset;
lastEndOffset = endOffset;
count++;
}
assertTrue(count + " does not equal: " + 25, count == 25);
}