This article collects typical usage examples of the C# method Lucene.Net.Analysis.Analyzer.TokenStream. If you have been wondering what Analyzer.TokenStream does in C# and how to use it, the curated code samples below should help. You can also look further into its containing class, Lucene.Net.Analysis.Analyzer.
Below are 15 code examples of Analyzer.TokenStream, ordered by popularity by default.
Example 1: FormSimilarQuery
/// <summary> Simple similarity query generators.
/// Takes every unique word and forms a boolean query where all words are optional.
/// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
/// The only caveat is the first hit returned <b>should be</b> your source document - you'll
/// need to then ignore that.
///
/// <p/>
///
/// So, if you have a code fragment like this:
/// <br/>
/// <code>
/// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
/// </code>
///
/// <p/>
///
/// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
///
/// <p/>
/// The philosophy behind this method is "two documents are similar if they share lots of words".
/// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
///
/// <p/>
/// This method is fail-safe in that if a long 'body' is passed in and
/// <see cref="BooleanQuery.Add"/> (used internally)
/// throws
/// <see cref="BooleanQuery.TooManyClauses"/>, the
/// query built so far is returned.
/// </summary>
/// <param name="body">the body of the document you want to find similar documents to
/// </param>
/// <param name="a">the analyzer to use to parse the body
/// </param>
/// <param name="field">the field you want to search on, probably something like "contents" or "body"
/// </param>
/// <param name="stop">optional set of stop words to ignore
/// </param>
/// <returns> a query with all unique words in 'body'
/// </returns>
/// <throws> IOException this can't happen... </throws>
public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
{
TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
BooleanQuery tmp = new BooleanQuery();
ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>(); // ignore dups
while (ts.IncrementToken())
{
String word = termAtt.Term;
// ignore opt stop words
if (stop != null && stop.Contains(word))
continue;
// ignore dups
if (already.Contains(word))
continue;
already.Add(word);
// add to query
TermQuery tq = new TermQuery(new Term(field, word));
try
{
tmp.Add(tq, Occur.SHOULD);
}
catch (BooleanQuery.TooManyClauses)
{
// fail-safe, just return what we have, not the end of the world
break;
}
}
return tmp;
}
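A quick usage sketch (not from the original page; the StandardAnalyzer and the "contents" field are illustrative assumptions for Lucene.Net 3.0.3):
// Hedged sketch: building a similarity query with FormSimilarQuery.
Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", analyzer, "contents", null);
System.Console.WriteLine(q); // roughly: contents:i contents:use contents:lucene ... (all SHOULD clauses)
// Run q against your IndexSearcher and skip the first hit, which should be the source document.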
Example 2: Highlight
public static void Highlight(Document d, string query, Analyzer analyzer)
{
string contents = d.Get("contents");
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"highlight\"><b>", "</b></span>");
//SpanGradientFormatter formatter = new SpanGradientFormatter(10.0f, null, null, "#F1FD9F", "#EFF413");
//SimpleHTMLEncoder encoder = new SimpleHTMLEncoder();
SimpleFragmenter fragmenter = new SimpleFragmenter(250);
Highlighter hiliter = new Highlighter(formatter, new QueryScorer(QueryParser.Parse(query, "contents", analyzer)));
hiliter.SetTextFragmenter(fragmenter);
int numfragments = contents.Length / fragmenter.GetFragmentSize() + 1; // +1 ensures it's never zero; extra fragments beyond the required number don't hurt.
StringBuilder result = new StringBuilder("<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><style>.highlight{background:yellow;}</style><head><title>Search Results - ");
result.Append(d.Get("filename"));
result.Append("</title></head><body><font face=Arial size=5>");
TokenStream tokenstream = analyzer.TokenStream("contents", new System.IO.StringReader(contents));
TextFragment[] frags = hiliter.GetBestTextFragments(tokenstream, contents, false, numfragments);
foreach (TextFragment frag in frags)
{
if (frag.GetScore() > 0)
{
result.Append(frag.ToString() + "<br/><hr/><br/>");
}
}
string contentspath = System.IO.Path.Combine(System.Windows.Forms.Application.StartupPath, "contents.html");
result.Append("</font><a target=_self href=\"file:///");
result.Append(contentspath);
result.Append("\">View Original Document...</a>");
result.Append("</body></html>");
result.Replace("\n", "<br/>");
string resultspath = System.IO.Path.Combine(System.Windows.Forms.Application.StartupPath, "results.html");
System.IO.File.WriteAllText(resultspath, result.ToString());
//webBrowser1.Url = new Uri("file:///" + resultspath);
Highlighter hiliter2 = new Highlighter(formatter, new QueryScorer(QueryParser.Parse(query, "contents", analyzer)));
hiliter2.SetTextFragmenter(fragmenter);
TokenStream tokstr = analyzer.TokenStream(new System.IO.StringReader(contents));
StringBuilder htmlcontents = new StringBuilder("<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><style>.highlight{background:yellow;}</style><body><font face=Arial size=5>");
htmlcontents.Append(hiliter2.GetBestFragments(tokstr, contents, numfragments, "..."));
htmlcontents.Append("</font></body></html>");
htmlcontents.Replace("\n", "<br/>");
System.IO.File.WriteAllText(contentspath, htmlcontents.ToString());
}
Example 3: AssertAnalyzesTo
public virtual void AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output)
{
TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));
for (int i = 0; i < output.Length; i++)
{
Token t = ts.Next();
Assert.IsNotNull(t);
Assert.AreEqual(t.TermText(), output[i]);
}
Assert.IsNull(ts.Next());
ts.Close();
}
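A typical call site might look like this (a hedged sketch, not from the original page; WhitespaceAnalyzer is a stock Lucene.Net analyzer that splits on whitespace and does not lower-case):
// Hypothetical test call:
AssertAnalyzesTo(new WhitespaceAnalyzer(), "foo BAR", new System.String[] { "foo", "BAR" });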
Example 4: AssertAnalyzesTo
protected virtual void AssertAnalyzesTo(Analyzer analyzer, String input, String[] output)
{
var tokenStream = analyzer.TokenStream("dummyFieldName", new StringReader(input));
for( var i = 0; i < output.Length; i++ )
{
var t = tokenStream.Next();
Assert.IsNotNull(t);
Assert.AreEqual(output[i], t.TermText());
}
Assert.IsNull(tokenStream.Next());
tokenStream.Close();
}
Example 5: GetTokens
private static List<string> GetTokens(string keywords, Analyzer analyser)
{
var tokenStream = analyser.TokenStream(null, new StringReader(keywords));
var termAttribute = tokenStream.GetAttribute<ITermAttribute>();
tokenStream.Reset();
var list = new List<string>();
while (tokenStream.IncrementToken())
{
var term = termAttribute.Term;
list.Add(term);
}
return list;
}
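A possible call (my own sketch; assumes Lucene.Net 3.0.3's StandardAnalyzer, which lower-cases tokens and removes English stop words):
// Hypothetical usage, not part of the original example:
var analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
List<string> tokens = GetTokens("The Quick Brown Fox", analyzer);
// tokens -> "quick", "brown", "fox" ("The" is dropped as a stop word)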
Example 6: GetFieldQuery
// This is a simplified query builder which works for single Terms and single Phrases
// Returns null, TermQuery, or PhraseQuery
public static Lucene.Net.Search.Query GetFieldQuery(Analyzer analyzer, string field, string queryText)
{
TokenStream stream = analyzer.TokenStream(field, new StringReader(queryText));
TokenFilter filter = new CachingTokenFilter(stream);
filter.Reset();
// This attribute way of getting token properties isn't very good, but it's the non-obsolete one.
var attr1 = filter.GetAttribute<ITermAttribute>();
Func<string> getText = () => attr1 != null ? attr1.Term : null;
Func<int> getPositionIncrement;
if (filter.HasAttribute<IPositionIncrementAttribute>())
{
var attr = filter.GetAttribute<IPositionIncrementAttribute>();
getPositionIncrement = () => attr.PositionIncrement;
}
else
{
getPositionIncrement = () => 1;
}
// 0 tokens
if (!filter.IncrementToken())
{
return new BooleanQuery();
}
// 1 token?
string token1 = getText();
int position = 0;
if (!filter.IncrementToken())
{
return new TermQuery(new Term(field, token1));
}
// many tokens - handle first token
PhraseQuery ret = new PhraseQuery();
ret.Add(new Term(field, token1));
do
{
// handle rest of tokens
string tokenNext = getText();
position += getPositionIncrement();
ret.Add(new Term(field, tokenNext), position);
}
while (filter.IncrementToken());
return ret;
}
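Hedged examples of what this builder returns (field names and query text are illustrative assumptions):
// One analyzed token yields a TermQuery, several yield a PhraseQuery:
var analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
Query single = GetFieldQuery(analyzer, "title", "Lucene");              // TermQuery: title:lucene
Query phrase = GetFieldQuery(analyzer, "title", "open source search");  // PhraseQuery: title:"open source search"
Query empty  = GetFieldQuery(analyzer, "title", "the");                 // BooleanQuery: "the" is a stop word, so 0 tokens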
Example 7: TokensFromAnalysis
public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
{
TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
List<string> result = new List<string>();
TermAttribute tokenAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));
while (stream.IncrementToken())
{
result.Add(tokenAttr.Term());
}
stream.End();
stream.Close();
return result;
}
Example 8: TokensFromAnalysis
public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
{
using (TokenStream stream = analyzer.TokenStream("contents", new StringReader(text)))
{
var result = new List<string>();
var tokenAttr = (TermAttribute) stream.GetAttribute<ITermAttribute>();
while (stream.IncrementToken())
{
result.Add(tokenAttr.Term);
}
stream.End();
return result;
}
}
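Either variant is handy for eyeballing an analyzer's output (a sketch, not from the original page; SimpleAnalyzer splits on non-letters and lower-cases):
// Hypothetical usage:
foreach (var token in TokensFromAnalysis(new Lucene.Net.Analysis.SimpleAnalyzer(), "Hello, World!"))
    Console.WriteLine(token); // prints "hello" then "world"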
Example 9: DisplayTokenWithPositions
public static void DisplayTokenWithPositions(Analyzer analyzer, string text)
{
var stream = analyzer.TokenStream("contents", new StringReader(text));
var termAttribute = stream.AddAttribute(typeof (TermAttribute)) as TermAttribute;
var positionIncrement =
stream.AddAttribute(typeof (PositionIncrementAttribute)) as PositionIncrementAttribute;
int position = 0;
while (stream.IncrementToken())
{
int increment = positionIncrement.GetPositionIncrement();
if(increment>0)
{
position = position + increment;
Console.WriteLine();
Console.WriteLine("{0}: ", position);
}
Console.WriteLine("[{0}]", termAttribute.Term());
}
Console.WriteLine();
}
Example 10: AssertAnalyzesTo
public virtual void AssertAnalyzesTo(Analyzer a, System.String input, System.String[] expectedImages, System.String[] expectedTypes, int[] expectedPosIncrs)
{
TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));
for (int i = 0; i < expectedImages.Length; i++)
{
Token t = ts.Next();
Assert.IsNotNull(t);
Assert.AreEqual(expectedImages[i], t.TermText());
if (expectedTypes != null)
{
Assert.AreEqual(expectedTypes[i], t.Type());
}
if (expectedPosIncrs != null)
{
Assert.AreEqual(expectedPosIncrs[i], t.GetPositionIncrement());
}
}
Assert.IsNull(ts.Next());
ts.Close();
}
Example 11: Expand
/// <summary>
/// Perform synonym expansion on a query.
/// </summary>
/// <param name="query">query</param>
/// <param name="syns">syns</param>
/// <param name="a">a</param>
/// <param name="field">field</param>
/// <param name="boost">boost</param>
public static Query Expand(String query,
Searcher syns,
Analyzer a,
String field,
float boost)
{
already = new List<String>(); // avoid dups
var top = new List<String>(); // needs to be separately listed..
var ts = a.TokenStream(field, new StringReader(query));
var termAtt = ts.AddAttribute<TermAttribute>();
while (ts.IncrementToken())
{
var word = termAtt.Term;
if (!already.Contains(word))
{
already.Add(word);
top.Add(word);
}
}
tmp = new BooleanQuery();
// [2] form query
System.Collections.IEnumerator it = top.GetEnumerator();
while (it.MoveNext())
{
// [2a] add to level words in
var word = (String)it.Current;
var tq = new TermQuery(new Term(field, word));
tmp.Add(tq, Occur.SHOULD);
var c = new CollectorImpl(field, boost);
syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
}
return tmp;
}
Example 12: GetTokenStream
//convenience method
public static TokenStream GetTokenStream(String field, String contents, Analyzer analyzer)
{
return analyzer.TokenStream(field, new StringReader(contents));
}
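Since a TokenStream is single-use, callers such as the highlighter need a fresh stream per document; this helper just centralizes that construction. A hedged usage sketch (the field name and text are assumptions):
// Hypothetical usage, not from the original page:
var analyzer = new Lucene.Net.Analysis.SimpleAnalyzer();
TokenStream ts = GetTokenStream("contents", "Some stored text", analyzer);
var termAtt = ts.AddAttribute<ITermAttribute>();
while (ts.IncrementToken())
    Console.WriteLine(termAtt.Term); // "some", "stored", "text"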
Example 13: CheckAnalysisConsistency
private static void CheckAnalysisConsistency(Random random, Analyzer a, bool useCharFilter, string text, bool offsetsAreCorrect, Field field)
{
if (VERBOSE)
{
Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: get first token stream now text=" + text);
}
ICharTermAttribute termAtt;
IOffsetAttribute offsetAtt;
IPositionIncrementAttribute posIncAtt;
IPositionLengthAttribute posLengthAtt;
ITypeAttribute typeAtt;
IList<string> tokens = new List<string>();
IList<string> types = new List<string>();
IList<int> positions = new List<int>();
IList<int> positionLengths = new List<int>();
IList<int> startOffsets = new List<int>();
IList<int> endOffsets = new List<int>();
int remainder = random.Next(10);
StringReader reader = new StringReader(text);
TokenStream ts;
using (ts = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader))
{
termAtt = ts.HasAttribute<ICharTermAttribute>()
? ts.GetAttribute<ICharTermAttribute>()
: null;
offsetAtt = ts.HasAttribute<IOffsetAttribute>()
? ts.GetAttribute<IOffsetAttribute>()
: null;
posIncAtt = ts.HasAttribute<IPositionIncrementAttribute>()
? ts.GetAttribute<IPositionIncrementAttribute>()
: null;
posLengthAtt = ts.HasAttribute<IPositionLengthAttribute>()
? ts.GetAttribute<IPositionLengthAttribute>()
: null;
typeAtt = ts.HasAttribute<ITypeAttribute>() ? ts.GetAttribute<ITypeAttribute>() : null;
ts.Reset();
// First pass: save away "correct" tokens
while (ts.IncrementToken())
{
Assert.IsNotNull(termAtt, "has no CharTermAttribute");
tokens.Add(termAtt.ToString());
if (typeAtt != null)
{
types.Add(typeAtt.Type);
}
if (posIncAtt != null)
{
positions.Add(posIncAtt.PositionIncrement);
}
if (posLengthAtt != null)
{
positionLengths.Add(posLengthAtt.PositionLength);
}
if (offsetAtt != null)
{
startOffsets.Add(offsetAtt.StartOffset());
endOffsets.Add(offsetAtt.EndOffset());
}
}
ts.End();
}
// verify reusing is "reproducible" and also get the normal tokenstream sanity checks
if (tokens.Count > 0)
{
// KWTokenizer (for example) can produce a token
// even when input is length 0:
if (text.Length != 0)
{
// (Optional) second pass: do something evil:
int evilness = random.Next(50);
if (evilness == 17)
{
if (VERBOSE)
{
Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis w/ exception");
}
// Throw an errant exception from the Reader:
MockReaderWrapper evilReader = new MockReaderWrapper(random, text);
evilReader.ThrowExcAfterChar(random.Next(text.Length));
reader = evilReader;
try
{
// NOTE: some Tokenizers go and read characters
// when you call .setReader(Reader), eg
// PatternTokenizer. this is a bit
// iffy... (really, they should only
// pull from the Reader when you call
// .IncrementToken(), I think?), but we
// currently allow it, so, we must call
// a.TokenStream inside the try since we may
// hit the exc on init:
//......... the rest of this example is omitted .........
Example 14: CheckResetException
internal static void CheckResetException(Analyzer a, string input)
{
TokenStream ts = a.TokenStream("bogus", new StringReader(input));
try
{
if (ts.IncrementToken())
{
ts.ReflectAsString(false);
Assert.Fail("didn't get expected exception when reset() not called");
}
}
catch (InvalidOperationException expected)
{
//ok
}
catch (AssertionException expected)
{
// ok: MockTokenizer
Assert.IsTrue(expected.Message != null && expected.Message.Contains("wrong state"), expected.Message);
}
catch (Exception unexpected)
{
//unexpected.printStackTrace(System.err);
Console.Error.WriteLine(unexpected.StackTrace);
Assert.Fail("got wrong exception when reset() not called: " + unexpected);
}
finally
{
// consume correctly
ts.Reset();
while (ts.IncrementToken())
{
}
ts.End();
ts.Dispose();
}
// check for a missing Close()
ts = a.TokenStream("bogus", new StringReader(input));
ts.Reset();
while (ts.IncrementToken())
{
}
ts.End();
try
{
ts = a.TokenStream("bogus", new StringReader(input));
Assert.Fail("didn't get expected exception when Close() not called");
}
catch (Exception)
{
// ok
}
finally
{
ts.Dispose();
}
}
Example 15: AnalyzeMultitermTerm
protected internal virtual BytesRef AnalyzeMultitermTerm(string field, string part, Analyzer analyzerIn)
{
if (analyzerIn == null) analyzerIn = Analyzer;
TokenStream source = null;
try
{
source = analyzerIn.TokenStream(field, part);
source.Reset();
ITermToBytesRefAttribute termAtt = source.GetAttribute<ITermToBytesRefAttribute>();
BytesRef bytes = termAtt.BytesRef;
if (!source.IncrementToken())
throw new ArgumentException("analyzer returned no terms for multiTerm term: " + part);
termAtt.FillBytesRef();
if (source.IncrementToken())
throw new ArgumentException("analyzer returned too many terms for multiTerm term: " + part);
source.End();
return BytesRef.DeepCopyOf(bytes);
}
catch (IOException e)
{
throw new Exception("Error analyzing multiTerm term: " + part, e);
}
finally
{
IOUtils.CloseWhileHandlingException(source);
}
}