

C# Analyzer.TokenStream Method Code Examples

This article collects typical usage examples of the C# method Lucene.Net.Analysis.Analyzer.TokenStream. If you are wondering what Analyzer.TokenStream does, how to call it, or how it is used in practice, the hand-picked examples below should help. You can also explore further usage examples of the containing class, Lucene.Net.Analysis.Analyzer.


The following presents 15 code examples of the Analyzer.TokenStream method, sorted by popularity by default.
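
Before the individual examples, here is a minimal sketch of the basic call pattern (a sketch only, assuming the Lucene.Net 3.0.3 API, where TokenStream(fieldName, reader) returns a token stream and ITermAttribute exposes each token's text; the field name and input text are placeholders):

    // Tokenize a string and print each term (assumes: using Lucene.Net.Analysis;
    // using Lucene.Net.Analysis.Tokenattributes;)
    Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    TokenStream ts = analyzer.TokenStream("contents", new System.IO.StringReader("The quick brown fox"));
    ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
    while (ts.IncrementToken())
    {
        System.Console.WriteLine(termAtt.Term); // prints: quick, brown, fox ("the" is a stop word)
    }
    ts.End();
    ts.Close();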

Example 1: FormSimilarQuery

        /// <summary> Simple similarity query generator.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
        /// The only caveat is that the first hit returned <b>should be</b> your source document - you'll
        /// then need to ignore it.
        /// 
        /// <p/>
        /// 
        /// So, if you have a code fragment like this:
        /// <br/>
        /// <code>
        /// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        /// 
        /// <p/>
        /// 
        /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
        /// 
        /// <p/>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
        /// 
        /// <p/>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// <see cref="BooleanQuery.Add"/> (used internally)
        /// throws
        /// <see cref="BooleanQuery.TooManyClauses"/>, the
        /// query as it is will be returned.
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to
        /// </param>
        /// <param name="a">the analyzer to use to parse the body
        /// </param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
        /// </param>
        /// <param name="stop">optional set of stop words to ignore
        /// </param>
        /// <returns> a query with all unique words in 'body'
        /// </returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
        {
            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();

            BooleanQuery tmp = new BooleanQuery();
            ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>(); // ignore dups
            while (ts.IncrementToken())
            {
                String word = termAtt.Term;
                // ignore opt stop words
                if (stop != null && stop.Contains(word))
                    continue;
                // ignore dups
                if (already.Contains(word))
                    continue;
                already.Add(word);
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return tmp;
        }
Developer: raol, Project: lucene.net, Lines: 71, Source: SimilarityQueries.cs
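
A possible call site for this helper (a hypothetical sketch: SimilarityQueries is the containing class per the source file above, and searcher is assumed to be an IndexSearcher already open over the target index; hit 0 is skipped because, per the doc comment, it should be the source document itself):

    Query q = SimilarityQueries.FormSimilarQuery(
        "I use Lucene to search fast. Fast searchers are good",
        new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30),
        "contents",
        null);
    TopDocs hits = searcher.Search(q, 10);
    for (int i = 1; i < hits.ScoreDocs.Length; i++) // skip hit 0: the source document
    {
        Document similarDoc = searcher.Doc(hits.ScoreDocs[i].Doc);
        // ... use the similar document
    }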

Example 2: Highlight

        public static void Highlight(Document d, string query, Analyzer analyzer)
        {
            string contents = d.Get("contents");
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"highlight\"><b>", "</b></span>");
            //SpanGradientFormatter formatter = new SpanGradientFormatter(10.0f, null, null, "#F1FD9F", "#EFF413");
            //SimpleHTMLEncoder encoder = new SimpleHTMLEncoder();
            SimpleFragmenter fragmenter = new SimpleFragmenter(250);
            Highlighter hiliter = new Highlighter(formatter, new QueryScorer(QueryParser.Parse(query, "contents", analyzer)));

            hiliter.SetTextFragmenter(fragmenter);
            int numfragments = contents.Length / fragmenter.GetFragmentSize() + 1; // +1 ensures it's never zero; extra fragments do no harm.
            StringBuilder result = new StringBuilder("<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><style>.highlight{background:yellow;}</style><head><title>Search Results - ");
            result.Append(d.Get("filename"));
            result.Append("</title></head><body><font face=Arial size=5>");
            TokenStream tokenstream = analyzer.TokenStream("contents", new System.IO.StringReader(contents));
            TextFragment[] frags = hiliter.GetBestTextFragments(tokenstream, contents, false, numfragments);
            foreach (TextFragment frag in frags)
            {
                if (frag.GetScore() > 0)
                {
                    result.Append(frag.ToString() + "<br/><hr/><br/>");

                }

            }

            string contentspath = System.IO.Path.Combine(System.Windows.Forms.Application.StartupPath, "contents.html");
            result.Append("</font><a target=_self href=\"file:///");
            result.Append(contentspath);
            result.Append("\">View Original Document...</a>");
            result.Append("</body></html>");
            result.Replace("\n", "<br/>");

            string resultspath = System.IO.Path.Combine(System.Windows.Forms.Application.StartupPath, "results.html");
            System.IO.File.WriteAllText(resultspath, result.ToString());
            //webBrowser1.Url = new Uri("file:///" + resultspath);

            Highlighter hiliter2 = new Highlighter(formatter, new QueryScorer(QueryParser.Parse(query, "contents", analyzer)));
            hiliter2.SetTextFragmenter(fragmenter);
            TokenStream tokstr = analyzer.TokenStream("contents", new System.IO.StringReader(contents));
            StringBuilder htmlcontents = new StringBuilder("<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><style>.highlight{background:yellow;}</style><body><font face=Arial size=5>");
            htmlcontents.Append(hiliter2.GetBestFragments(tokstr, contents, numfragments, "..."));
            htmlcontents.Append("</font></body></html>");
            htmlcontents.Replace("\n", "<br/>");
            System.IO.File.WriteAllText(contentspath, htmlcontents.ToString());
        }
Developer: usmanghani, Project: Misc, Lines: 46, Source: ResultHighlighter.cs

Example 3: AssertAnalyzesTo

		public virtual void  AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output)
		{
			TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));
			for (int i = 0; i < output.Length; i++)
			{
				Token t = ts.Next();
				Assert.IsNotNull(t);
				Assert.AreEqual(output[i], t.TermText());
			}
			Assert.IsNull(ts.Next());
			ts.Close();
		}
Developer: vikasraz, Project: indexsearchutils, Lines: 12, Source: TestAnalyzers.cs

Example 4: AssertAnalyzesTo

 protected virtual void AssertAnalyzesTo(Analyzer analyzer, String input, String[] output)
 {
     var tokenStream = analyzer.TokenStream("dummyFieldName", new StringReader(input));
     for( var i = 0; i < output.Length; i++ )
     {
         var t = tokenStream.Next();
         Assert.IsNotNull(t);
         Assert.AreEqual(output[i], t.TermText());
     }
     Assert.IsNull(tokenStream.Next());
     tokenStream.Close();
 }
Developer: DavidMoore, Project: Foundation, Lines: 12, Source: SnowballAnalyzerFixture.cs

Example 5: GetTokens

 private static List<string> GetTokens(string keywords, Analyzer analyser)
 {
     var tokenStream = analyser.TokenStream(null, new StringReader(keywords));
     var termAttribute = tokenStream.GetAttribute<ITermAttribute>();
     tokenStream.Reset();
     var list = new List<string>();
     while (tokenStream.IncrementToken())
     {
         var term = termAttribute.Term;
         list.Add(term);
     }
     return list;
 }
Developer: neozhu, Project: MrCMS, Lines: 13, Source: LuceneQueryStringHelper.cs

Example 6: GetFieldQuery

        // This is a simplified query builder which works for single Terms and single Phrases.
        // Returns an empty BooleanQuery (no tokens), a TermQuery (one token), or a PhraseQuery (multiple tokens).
        public static Lucene.Net.Search.Query GetFieldQuery(Analyzer analyzer, string field, string queryText)
        {
            TokenStream stream = analyzer.TokenStream(field, new StringReader(queryText));
            TokenFilter filter = new CachingTokenFilter(stream);
            filter.Reset();

            // This attribute way of getting token properties isn't very good, but it's the non-obsolete one.
            var attr1 = filter.GetAttribute<ITermAttribute>();
            Func<string> getText = () => attr1 != null ? attr1.Term : null;

            Func<int> getPositionIncrement;
            if (filter.HasAttribute<IPositionIncrementAttribute>())
            {
                var attr = filter.GetAttribute<IPositionIncrementAttribute>();
                getPositionIncrement = () => attr.PositionIncrement;
            }
            else
            {
                getPositionIncrement = () => 1;
            }

            // 0 tokens
            if (!filter.IncrementToken())
            {
                return new BooleanQuery();
            }

            // 1 token?
            string token1 = getText();
            int position = 0;
            if (!filter.IncrementToken())
            {
                return new TermQuery(new Term(field, token1));
            }

            // many tokens - handle first token
            PhraseQuery ret = new PhraseQuery();
            ret.Add(new Term(field, token1));

            do
            {
                // handle rest of tokens
                string tokenNext = getText();
                position += getPositionIncrement();
                ret.Add(new Term(field, tokenNext), position);
            }
            while (filter.IncrementToken());

            return ret;
        }
Developer: atrevisan, Project: NuGetGallery, Lines: 52, Source: AnalysisHelper.cs
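
A quick sketch of the three possible return shapes (hypothetical inputs; AnalysisHelper is the containing class per the source file above, and StandardAnalyzer's stop-word filtering is what empties the last query):

    var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    var q1 = AnalysisHelper.GetFieldQuery(analyzer, "contents", "lucene");      // one token   -> TermQuery
    var q2 = AnalysisHelper.GetFieldQuery(analyzer, "contents", "fast search"); // two tokens  -> PhraseQuery
    var q3 = AnalysisHelper.GetFieldQuery(analyzer, "contents", "the");         // zero tokens -> empty BooleanQuery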

Example 7: TokensFromAnalysis

        public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
        {
            TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
            List<string> result = new List<string>();
            TermAttribute tokenAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

            while (stream.IncrementToken())
            {
                result.Add(tokenAttr.Term());
            }

            stream.End();
            stream.Close();

            return result;
        }
Developer: vinone, Project: ravendb, Lines: 16, Source: LuceneAnalyzerUtils.cs

Example 8: TokensFromAnalysis

		public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
		{
			using (TokenStream stream = analyzer.TokenStream("contents", new StringReader(text)))
			{
				var result = new List<string>();
				var tokenAttr = (TermAttribute) stream.GetAttribute<ITermAttribute>();

				while (stream.IncrementToken())
				{
					result.Add(tokenAttr.Term);
				}

				stream.End();

				return result;
			}
		}
Developer: 925coder, Project: ravendb, Lines: 17, Source: LuceneAnalyzerUtils.cs

Example 9: DisplayTokenWithPositions

 public static void DisplayTokenWithPositions(Analyzer analyzer, string text)
 {
     var stream = analyzer.TokenStream("contents", new StringReader(text));
     var termAttribute = stream.AddAttribute(typeof (TermAttribute)) as TermAttribute;
     var positionIncrement =
         stream.AddAttribute(typeof (PositionIncrementAttribute)) as PositionIncrementAttribute;
     int position = 0;
     while (stream.IncrementToken())
     {
         int increment = positionIncrement.GetPositionIncrement();
         if(increment>0)
         {
             position = position + increment;
             Console.WriteLine();
             Console.WriteLine("{0}: ", position);
         }
         Console.WriteLine("[{0}]", termAttribute.Term());
     }
     Console.WriteLine();
 }
Developer: diegocaxito, Project: LuceneTest, Lines: 20, Source: AnalyzerUtil.cs

Example 10: AssertAnalyzesTo

		public virtual void  AssertAnalyzesTo(Analyzer a, System.String input, System.String[] expectedImages, System.String[] expectedTypes, int[] expectedPosIncrs)
		{
			TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));
			for (int i = 0; i < expectedImages.Length; i++)
			{
				Token t = ts.Next();
				Assert.IsNotNull(t);
				Assert.AreEqual(expectedImages[i], t.TermText());
				if (expectedTypes != null)
				{
					Assert.AreEqual(expectedTypes[i], t.Type());
				}
				if (expectedPosIncrs != null)
				{
					Assert.AreEqual(expectedPosIncrs[i], t.GetPositionIncrement());
				}
			}
			Assert.IsNull(ts.Next());
			ts.Close();
		}
Developer: vikasraz, Project: indexsearchutils, Lines: 20, Source: TestStandardAnalyzer.cs

Example 11: Expand

        /// <summary> 
        /// Perform synonym expansion on a query.
        /// </summary>
        /// <param name="query">the user query to expand</param>
        /// <param name="syns">a searcher over the synonym index</param>
        /// <param name="a">the analyzer used to tokenize the query</param>
        /// <param name="field">the field the expanded query will search</param>
        /// <param name="boost">the boost applied to synonym terms</param>
        public static Query Expand(String query, 
            Searcher syns, 
            Analyzer a, 
            String field, 
            float boost)
        {
            already = new List<String>(); // avoid dups
            var top = new List<String>(); // needs to be separately listed..

            var ts = a.TokenStream(field, new StringReader(query));
            var termAtt = ts.AddAttribute<ITermAttribute>();

            while (ts.IncrementToken())
            {
                var word = termAtt.Term;

                if (!already.Contains(word))
                {
                    already.Add(word);
                    top.Add(word);
                }
            }

            tmp = new BooleanQuery();

            // [2] form query
            foreach (var word in top)
            {
                // [2a] add to level words in
                var tq = new TermQuery(new Term(field, word));
                tmp.Add(tq, Occur.SHOULD);

                var c = new CollectorImpl(field, boost);
                syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
            }

            return tmp;
        }
Developer: Cefa68000, Project: lucenenet, Lines: 48, Source: SynLookup.cs
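
Note that already and tmp appear to be class-level fields here, so this method is not thread-safe as written. A hypothetical usage sketch (assumes a synonym index previously built by the companion Syns2Index tool and opened from synIndexDir; all names are placeholders):

    using (var synSearcher = new IndexSearcher(FSDirectory.Open(new System.IO.DirectoryInfo(synIndexDir)), true))
    {
        Query expanded = SynLookup.Expand(
            "big dog",
            synSearcher,
            new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30),
            "contents",
            0.9f);
        // 'expanded' ORs the query terms with synonyms found in the synonym index,
        // with each synonym term boosted by 0.9
    }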

Example 12: GetTokenStream

 //convenience method
 public static TokenStream GetTokenStream(String field, String contents, Analyzer analyzer)
 {
     return analyzer.TokenStream(field, new StringReader(contents));
 }
Developer: Cefa68000, Project: lucenenet, Lines: 5, Source: TokenSources.cs
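
A possible call site (hypothetical; TokenSources is the containing class per the source file above, and the stream is typically handed to a Highlighter):

    TokenStream ts = TokenSources.GetTokenStream("contents", text, analyzer);
    string[] fragments = highlighter.GetBestFragments(ts, text, 3); // top 3 fragments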

Example 13: CheckAnalysisConsistency

        private static void CheckAnalysisConsistency(Random random, Analyzer a, bool useCharFilter, string text, bool offsetsAreCorrect, Field field)
        {
            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: get first token stream now text=" + text);
            }

            ICharTermAttribute termAtt;
            IOffsetAttribute offsetAtt;
            IPositionIncrementAttribute posIncAtt;
            IPositionLengthAttribute posLengthAtt;
            ITypeAttribute typeAtt;

            IList<string> tokens = new List<string>();
            IList<string> types = new List<string>();
            IList<int> positions = new List<int>();
            IList<int> positionLengths = new List<int>();
            IList<int> startOffsets = new List<int>();
            IList<int> endOffsets = new List<int>();

            int remainder = random.Next(10);
            StringReader reader = new StringReader(text);

            TokenStream ts;
            using (ts = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader))
            {
                 termAtt = ts.HasAttribute<ICharTermAttribute>()
                    ? ts.GetAttribute<ICharTermAttribute>()
                    : null;
                offsetAtt = ts.HasAttribute<IOffsetAttribute>()
                    ? ts.GetAttribute<IOffsetAttribute>()
                    : null;
                posIncAtt = ts.HasAttribute<IPositionIncrementAttribute>()
                    ? ts.GetAttribute<IPositionIncrementAttribute>()
                    : null;
                posLengthAtt = ts.HasAttribute<IPositionLengthAttribute>()
                    ? ts.GetAttribute<IPositionLengthAttribute>()
                    : null;
                typeAtt = ts.HasAttribute<ITypeAttribute>() ? ts.GetAttribute<ITypeAttribute>() : null;

                ts.Reset();

                // First pass: save away "correct" tokens
                while (ts.IncrementToken())
                {
                    Assert.IsNotNull(termAtt, "has no CharTermAttribute");
                    tokens.Add(termAtt.ToString());
                    if (typeAtt != null)
                    {
                        types.Add(typeAtt.Type);
                    }
                    if (posIncAtt != null)
                    {
                        positions.Add(posIncAtt.PositionIncrement);
                    }
                    if (posLengthAtt != null)
                    {
                        positionLengths.Add(posLengthAtt.PositionLength);
                    }
                    if (offsetAtt != null)
                    {
                        startOffsets.Add(offsetAtt.StartOffset());
                        endOffsets.Add(offsetAtt.EndOffset());
                    }
                }
                ts.End();
            }

            // verify reusing is "reproducible" and also get the normal TokenStream sanity checks
            if (tokens.Count > 0)
            {
                // KWTokenizer (for example) can produce a token
                // even when input is length 0:
                if (text.Length != 0)
                {
                    // (Optional) second pass: do something evil:
                    int evilness = random.Next(50);
                    if (evilness == 17)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis w/ exception");
                        }
                        // Throw an errant exception from the Reader:

                        MockReaderWrapper evilReader = new MockReaderWrapper(random, text);
                        evilReader.ThrowExcAfterChar(random.Next(text.Length));
                        reader = evilReader;

                        try
                        {
                            // NOTE: some Tokenizers go and read characters
                            // when you call .SetReader(Reader), eg
                            // PatternTokenizer.  this is a bit
                            // iffy... (really, they should only
                            // pull from the Reader when you call
                            // .IncrementToken(), I think?), but we
                            // currently allow it, so, we must call
                            // a.TokenStream inside the try since we may
                            // hit the exc on init:
//......... remainder of code omitted .........
Developer: ChristopherHaws, Project: lucenenet, Lines: 101, Source: BaseTokenStreamTestCase.cs

Example 14: CheckResetException

        internal static void CheckResetException(Analyzer a, string input)
        {
            TokenStream ts = a.TokenStream("bogus", new StringReader(input));
            try
            {
                if (ts.IncrementToken())
                {
                    ts.ReflectAsString(false);
                    Assert.Fail("didn't get expected exception when reset() not called");
                }
            }
            catch (InvalidOperationException)
            {
                // ok
            }
            }
            catch (AssertionException expected)
            {
                // ok: MockTokenizer
                Assert.IsTrue(expected.Message != null && expected.Message.Contains("wrong state"), expected.Message);
            }
            catch (Exception unexpected)
            {
                //unexpected.printStackTrace(System.err);
                Console.Error.WriteLine(unexpected.StackTrace);
                Assert.Fail("got wrong exception when reset() not called: " + unexpected);
            }
            finally
            {
                // consume correctly
                ts.Reset();
                while (ts.IncrementToken())
                {
                }
                ts.End();
                ts.Dispose();
            }

            // check for a missing Close()
            ts = a.TokenStream("bogus", new StringReader(input));
            ts.Reset();
            while (ts.IncrementToken())
            {
            }
            ts.End();
            try
            {
                ts = a.TokenStream("bogus", new StringReader(input));
                Assert.Fail("didn't get expected exception when Close() not called");
            }
            catch (Exception)
            {
                // ok
            }
            finally
            {
                ts.Dispose();
            }
        }
Developer: ChristopherHaws, Project: lucenenet, Lines: 58, Source: BaseTokenStreamTestCase.cs

Example 15: AnalyzeMultitermTerm

        protected internal virtual BytesRef AnalyzeMultitermTerm(string field, string part, Analyzer analyzerIn)
        {
            if (analyzerIn == null) analyzerIn = Analyzer;

            TokenStream source = null;
            try
            {
                source = analyzerIn.TokenStream(field, part);
                source.Reset();

                ITermToBytesRefAttribute termAtt = source.GetAttribute<ITermToBytesRefAttribute>();
                BytesRef bytes = termAtt.BytesRef;

                if (!source.IncrementToken())
                    throw new ArgumentException("analyzer returned no terms for multiTerm term: " + part);
                termAtt.FillBytesRef();
                if (source.IncrementToken())
                    throw new ArgumentException("analyzer returned too many terms for multiTerm term: " + part);
                source.End();
                return BytesRef.DeepCopyOf(bytes);
            }
            catch (IOException e)
            {
                throw new Exception("Error analyzing multiTerm term: " + part, e);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(source);
            }
        }
Developer: ChristopherHaws, Project: lucenenet, Lines: 30, Source: QueryParserBase.cs


Note: The Lucene.Net.Analysis.Analyzer.TokenStream method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Please consult the corresponding project's license before redistributing or using the code; do not reproduce without permission.