当前位置: 首页>>代码示例>>Java>>正文


Java RegexMatches类代码示例

本文整理汇总了 Java 中 cc.mallet.pipe.tsf.RegexMatches 的典型用法代码示例。如果您正苦于以下问题:Java RegexMatches 类的具体用法是什么?Java RegexMatches 怎么用?有哪些 Java RegexMatches 的使用例子?那么,这里精选的类代码示例或许可以为您提供帮助。


RegexMatches类属于cc.mallet.pipe.tsf包,在下文中一共展示了RegexMatches类的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testMultiTagSerialization

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Verifies that a pipeline containing an {@code OffsetFeatureConjunction}
 * still produces the "time" feature after the whole pipe has been cloned
 * through {@code TestSerializable.cloneViaSerialization}.
 *
 * The input is {@code doc1} (defined elsewhere in the test class); the test
 * expects six tokens, with tokens 3 and 4 both carrying "time" = 1.0.
 */
public static void testMultiTagSerialization () throws IOException, ClassNotFoundException
{
  Pipe[] stages = new Pipe[] {
          new SimpleTaggerSentence2TokenSequence (),
          new TokenText (),
          new RegexMatches ("digits", Pattern.compile ("[0-9]+")),
          new RegexMatches ("ampm", Pattern.compile ("[aApP][mM]")),
          new OffsetFeatureConjunction ("time",
                  new String[] { "digits", "ampm" },
                  new int[] { 0, 1 },
                  true),
          new PrintInputAndTarget (),
  };
  Pipe original = new SerialPipes (stages);

  // Only the deserialized copy is exercised below, never the original.
  Pipe restored = (Pipe) TestSerializable.cloneViaSerialization (original);

  InstanceList instances = new InstanceList (restored);
  instances.addThruPipe (new ArrayIterator (doc1));

  TokenSequence tokens = (TokenSequence) instances.get (0).getData ();
  assertEquals (6, tokens.size ());
  assertEquals (1.0, tokens.get (3).getFeatureValue ("time"), 1e-15);
  assertEquals (1.0, tokens.get (4).getFeatureValue ("time"), 1e-15);
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:24,代码来源:TestOffsetFeatureConjunctions.java

示例2: addFullTextPipes

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Appends regex-based feature pipes that were added based on experience with
 * full-text input.
 *
 * @param usedPipeNames
 *            accepted for signature parity with the sibling helpers; this
 *            method does not read or modify it
 * @param pipes
 *            list the new pipes are appended to, in the order shown below
 */
private static void addFullTextPipes(List<String> usedPipeNames,
        List<Pipe> pipes) {

    // The two trailing ints of LongRegexSpaced are presumably the min/max
    // number of tokens a match may span -- TODO confirm against that class.

    // example input: blabla 24 24
    Pattern digitOtherDigit = Pattern.compile("\\d+[^\\d]+\\d+");
    pipes.add(new LongRegexSpaced("digit_then_other_then_digit",
            digitOtherDigit, 2, 4));

    // example input:
    // 30 mM K SO , 5 mM MgCl 6H O, 10 mM 24 24 22 HEPES
    Pattern digitOtherDigitOtherDigit = Pattern
            .compile(".*\\d+[^\\d\\n]+\\d+[^\\d\\n]+\\d+.*");
    pipes.add(new LongRegexSpaced(
            "digit_then_other_then_digit_then_other_then_digit",
            digitOtherDigitOtherDigit, 4, 9));

    // example inputs: "n 19", "n 5" -- bare and with either or both
    // surrounding parentheses
    Pattern nDigit = Pattern.compile("n \\d+");
    pipes.add(new LongRegexSpaced("n_space_digit", nDigit, 2, 2));

    Pattern parenNDigitParen = Pattern.compile("\\( n \\d+ \\)");
    pipes.add(new LongRegexSpaced("parenthesis_n_space_digit_parenthesis",
            parenNDigitParen, 3, 4));

    Pattern nDigitParen = Pattern.compile("n \\d+ \\)");
    pipes.add(new LongRegexSpaced("n_space_digit_parenthesis",
            nDigitParen, 3, 4));

    Pattern parenNDigit = Pattern.compile("\\( n \\d+");
    pipes.add(new LongRegexSpaced("parenthesis_n_space_digit",
            parenNDigit, 3, 4));

    // "Fig" is never found in any lexicon, so it gets its own feature
    Pattern containsFig = Pattern.compile(".*Fig.*");
    pipes.add(new RegexMatches("Figure", containsFig));
}
 
开发者ID:BlueBrain,项目名称:bluima,代码行数:28,代码来源:BrainRegionPipes.java

示例3: addPrefixPipes

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Adds one {@link RegexMatches} pipe per non-blank line of {@code file}.
 * Each pipe uses the pattern {@code (<line>.{1,3})}, compiled
 * case-insensitively, i.e. the trimmed line followed by one to three
 * arbitrary characters. All pipes share the feature name {@code name}.
 *
 * The line is inserted into the pattern verbatim, so regex metacharacters in
 * the file are interpreted as regex syntax.
 *
 * @param pipes
 *            list the new pipes are appended to
 * @param file
 *            file whose lines supply the prefixes, read via {@code linesFrom}
 * @param name
 *            feature name given to every created pipe
 * @throws IOException
 *             declared for the file read performed by {@code linesFrom}
 */
public static void addPrefixPipes(List<Pipe> pipes, File file, String name)
        throws IOException {
    for (String line : linesFrom(file.getAbsolutePath())) {
        String prefix = line.trim();
        if (prefix.isEmpty()) {
            // A blank line would produce "(.{1,3})", a pattern that no
            // longer depends on any lexicon entry; skip it.
            continue;
        }
        pipes.add(new RegexMatches(name, compile("(" + prefix
                + ".{1,3})", CASE_INSENSITIVE)));
    }
}
 
开发者ID:BlueBrain,项目名称:bluima,代码行数:8,代码来源:BrainRegionPipes.java

示例4: addSubstringRegexPipes

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Records the group name "Substring regexes" in {@code usedPipeNames} and
 * appends one case-insensitive "contains this stem" {@link RegexMatches}
 * pipe per stem. The feature for stem {@code s} is named {@code s + "Regex"}.
 *
 * @param usedPipeNames
 *            list that receives the group name
 * @param pipes
 *            list the new pipes are appended to
 */
public static void addSubstringRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Substring regexes");

    // "thalamic" and "nuclei" are probably already covered by the 1-grams
    final String[] stems = { "cortic", "cerebel" };
    for (int i = 0; i < stems.length; i++) {
        String stem = stems[i];
        String featureName = stem + "Regex";
        String regex = ".*" + stem + ".*";
        pipes.add(new RegexMatches(featureName, compile(regex,
                CASE_INSENSITIVE)));
    }
}
 
开发者ID:BlueBrain,项目名称:bluima,代码行数:11,代码来源:BrainRegionPipes.java

示例5: testMultiTag

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Runs {@code doc1} through two pipelines that differ only in the boolean
 * flag given to {@code OffsetFeatureConjunction} and compares the resulting
 * "time" feature.
 *
 * Expected: both pipelines yield six tokens and set "time" on token 3; only
 * the pipeline built with {@code true} also sets it on token 4.
 */
public static void testMultiTag ()
{
  Pipe flagOnPipe = buildTimePipe (true);
  Pipe flagOffPipe = buildTimePipe (false);

  InstanceList flagOnList = new InstanceList (flagOnPipe);
  InstanceList flagOffList = new InstanceList (flagOffPipe);

  flagOnList.addThruPipe (new ArrayIterator (doc1));
  flagOffList.addThruPipe (new ArrayIterator (doc1));

  TokenSequence flagOnTokens = (TokenSequence) flagOnList.get (0).getData ();
  TokenSequence flagOffTokens = (TokenSequence) flagOffList.get (0).getData ();

  assertEquals (6, flagOnTokens.size ());
  assertEquals (6, flagOffTokens.size ());

  assertEquals (1.0, flagOnTokens.get (3).getFeatureValue ("time"), 1e-15);
  assertEquals (1.0, flagOffTokens.get (3).getFeatureValue ("time"), 1e-15);
  assertEquals (1.0, flagOnTokens.get (4).getFeatureValue ("time"), 1e-15);
  assertEquals (0.0, flagOffTokens.get (4).getFeatureValue ("time"), 1e-15);
}

/**
 * Builds the digits/ampm pipeline used by {@code testMultiTag}, passing
 * {@code flag} as the last argument of the "time" OffsetFeatureConjunction.
 */
private static Pipe buildTimePipe (boolean flag)
{
  return new SerialPipes (new Pipe[] {
          new SimpleTaggerSentence2TokenSequence (),
          new TokenText (),
          new RegexMatches ("digits", Pattern.compile ("[0-9]+")),
          new RegexMatches ("ampm", Pattern.compile ("[aApP][mM]")),
          new OffsetFeatureConjunction ("time",
                  new String[] { "digits", "ampm" },
                  new int[] { 0, 1 },
                  flag),
          new PrintInputAndTarget (),
  });
}
 
开发者ID:kostagiolasn,项目名称:NucleosomePatternClassifier,代码行数:46,代码来源:TestOffsetFeatureConjunctions.java

示例6: createDefaultPipes

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Assembles the default feature-extraction pipeline: token text, 2- and
 * 3-character prefixes and suffixes, character 2/3-grams, one
 * {@link RegexMatches} feature per {@code TextUtil} regex listed below,
 * offset conjunctions with the previous and next position, and finally the
 * conversion to a feature-vector sequence.
 *
 * @param dataAlphabet
 *            alphabet set as the data alphabet of the returned pipe
 * @param targetAlphabet
 *            alphabet set as the target alphabet of the returned pipe
 * @return the assembled pipe with target processing enabled
 */
public static SerialPipes createDefaultPipes(Alphabet dataAlphabet, Alphabet targetAlphabet) {
	// Feature name paired with the regex that triggers it; registration
	// order follows the order of this table.
	final String[][] regexFeatures = {
			{ "ALL_CAPS_REGEX", TextUtil.ALL_CAPS_REGEX },
			{ "ALPHA_NUMERIC_REGEX", TextUtil.ALPHA_NUMERIC_REGEX },
			{ "CAPS_MIX_REGEX", TextUtil.CAPS_MIX_REGEX },
			{ "EMAIL_REGEX", TextUtil.EMAIL_REGEX },
			{ "END_DASH_REGEX", TextUtil.END_DASH_REGEX },
			{ "EXP_NUMBER_REGEX", TextUtil.EXP_NUMBER_REGEX },
			{ "FLOATING_POINT_NUMBER_REGEX", TextUtil.FLOATING_POINT_NUMBER_REGEX },
			{ "FOUR_CAPS_REGEX", TextUtil.FOUR_CAPS_REGEX },
			{ "FOUR_DIGITS_REGEX", TextUtil.FOUR_DIGITS_REGEX },
			{ "HAS_DASH_REGEX", TextUtil.HAS_DASH_REGEX },
			{ "HAS_DIGIT_REGEX", TextUtil.HAS_DIGIT_REGEX },
			{ "HEX_REGEX", TextUtil.HEX_REGEX },
			{ "HTML_REGEX", TextUtil.HTML_REGEX },
			{ "IN_PARENTHESES_REGEX", TextUtil.IN_PARENTHESES_REGEX },
			{ "INIT_CAPS_ALPHA_REGEX", TextUtil.INIT_CAPS_ALPHA_REGEX },
			{ "INIT_CAPS_REGEX", TextUtil.INIT_CAPS_REGEX },
			{ "INIT_DASH_REGEX", TextUtil.INIT_DASH_REGEX },
			{ "IP_REGEX", TextUtil.IP_REGEX },
			{ "NEGATIVE_INTEGER_REGEX", TextUtil.NEGATIVE_INTEGER_REGEX },
			{ "ONE_CAP_REGEX", TextUtil.ONE_CAP_REGEX },
			{ "ONE_DIGIT_REGEX", TextUtil.ONE_DIGIT_REGEX },
			{ "POSITIVE_INTEGER_REGEX", TextUtil.POSITIVE_INTEGER_REGEX },
			{ "PUNCTUATION_REGEX", TextUtil.PUNCTUATION_REGEX },
			{ "ROMAN_NUMBER_CAPITAL_REGEX", TextUtil.ROMAN_NUMBER_CAPITAL_REGEX },
			{ "ROMAN_NUMBER_SMALL_REGEX", TextUtil.ROMAN_NUMBER_SMALL_REGEX },
			{ "SINGLE_INITIAL_REGEX", TextUtil.SINGLE_INITIAL_REGEX },
			{ "THREE_CAPS_REGEX", TextUtil.THREE_CAPS_REGEX },
			{ "THREE_DIGITS_REGEX", TextUtil.THREE_DIGITS_REGEX },
			{ "TWO_CAPS_REGEX", TextUtil.TWO_CAPS_REGEX },
			{ "TWO_DIGITS_REGEX", TextUtil.TWO_DIGITS_REGEX },
			{ "URL_REGEX", TextUtil.URL_REGEX },
			{ "YEAR_REGEX", TextUtil.YEAR_REGEX },
			{ "OBD_REGEX", TextUtil.OBD_REGEX },
			{ "ONE_QUESTION_MARK_REGEX", TextUtil.ONE_QUESTION_MARK_REGEX },
			{ "TWO_QUESTION_MARKS_REGEX", TextUtil.TWO_QUESTION_MARKS_REGEX },
			{ "THREE_QUESTION_MARKS_REGEX", TextUtil.THREE_QUESTION_MARKS_REGEX },
			{ "MULTIPLE_QUESTION_MARKS_REGEX", TextUtil.MULTIPLE_QUESTION_MARKS_REGEX },
			{ "ONE_EXCLAMATION_MARK_REGEX", TextUtil.ONE_EXCLAMATION_MARK_REGEX },
			{ "TWO_EXCLAMATION_MARKS_REGEX", TextUtil.TWO_EXCLAMATION_MARKS_REGEX },
			{ "THREE_EXCLAMATION_MARKS_REGEX", TextUtil.THREE_EXCLAMATION_MARKS_REGEX },
			{ "MULTIPLE_EXCLAMATION_MARKS_REGEX", TextUtil.MULTIPLE_EXCLAMATION_MARKS_REGEX },
			{ "QUESTION_EXCLAMATION_MARK_REGEX", TextUtil.QUESTION_EXCLAMATION_MARK_REGEX },
			{ "EXCLAMATION_QUESTION_MARK_REGEX", TextUtil.EXCLAMATION_QUESTION_MARK_REGEX },
	};

	List<Pipe> stages = new ArrayList<Pipe>();

	// Surface-form features.
	stages.add(new TokenText());
	stages.add(new TokenTextCharPrefix("PREFIX=", 2));
	stages.add(new TokenTextCharPrefix("PREFIX=", 3));
	stages.add(new TokenTextCharSuffix("SUFFIX=", 2));
	stages.add(new TokenTextCharSuffix("SUFFIX=", 3));
	stages.add(new TokenTextCharNGrams("NGRAM=", new int[] { 2, 3 }));

	// One regex-match feature per table row.
	for (String[] entry : regexFeatures) {
		String featureName = entry[0];
		String regex = entry[1];
		stages.add(new RegexMatches(featureName, Pattern.compile(regex)));
	}

	// Conjoin features with those at offsets -1 and +1.
	stages.add(new OffsetConjunctions(new int[][] { { -1 }, { 1 } }));

	// NOTE(review): targetAlphabet is handed to this constructor while
	// dataAlphabet is set on the SerialPipes below -- confirm that the
	// target alphabet is really the intended argument here.
	stages.add(new TokenSequence2FeatureVectorSequence(targetAlphabet));

	SerialPipes result = new SerialPipes(stages);
	result.setDataAlphabet(dataAlphabet);
	result.setTargetAlphabet(targetAlphabet);
	result.setTargetProcessing(true);
	return result;
}
 
开发者ID:jdmp,项目名称:java-data-mining-package,代码行数:65,代码来源:MalletUtil.java

示例7: NEPipes

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Builds the fixed list of named-entity feature pipes and hands it to the
 * superclass constructor.
 *
 * {@code RegexMatches} entries take a feature name and a pattern;
 * {@code LongRegexMatches} entries additionally take two ints, presumably
 * the minimum and maximum number of tokens a match may span (TODO confirm
 * against LongRegexMatches). The pattern constants (ALLCAPS, DECIMAL, P10,
 * ...) are defined elsewhere in this class.
 */
public NEPipes() {
    super(
            new Pipe[] {
                    //new TokenText( "text=" ),

                    // --- character class / casing of a single token ---
                    new RegexMatches( "SingleLetter", Pattern.compile( "[A-Za-z]" ) ),
                    new RegexMatches( "AllCaps", Pattern.compile( ALLCAPS ) ),
                    new RegexMatches( "AllLower", Pattern.compile( ALLLOWER ) ),
                    new RegexMatches( "InitCaps", Pattern.compile( INITCAPS ) ),
                    new RegexMatches( "MixedCase", Pattern.compile( MIXEDCASE ) ),
                    new RegexMatches( "MixedNum", Pattern.compile( MIXEDNUM ) ),
                    new RegexMatches( "EndSentPunc", Pattern.compile( ENDSENTENCE ) ),
                    new RegexMatches( "Punc", Pattern.compile( PUNCTUATION ) ),
                    new RegexMatches( "Bracket", Pattern.compile( BRACKET ) ),
                    new RegexMatches( "Ordinal", Pattern.compile( ORDINAL, Pattern.CASE_INSENSITIVE ) ),

                    // --- punctuation-delimited spans ---
                    new LongRegexMatches( "Quoted", Pattern.compile( QUOTED ), 1, 4 ),
                    new LongRegexMatches( "Bracketed", Pattern.compile( BRACKETED ), 1, 4 ),
                    new LongRegexMatches( "Initial", Pattern.compile( INITIAL ), 2, 2 ),
                    new LongRegexMatches( "Ellipse", Pattern.compile( DOTS ), 1, 2 ),
                    new LongRegexMatches( "Dashes", Pattern.compile( DASHES ), 2, 2 ),
                    new LongRegexMatches( "Fraction", Pattern.compile( FRACTION ), 1, 3 ),
                    new LongRegexMatches( "DotDecimal", Pattern.compile( DOTDECIMAL ), 1, 3 ),

                    // --- numbers, percentages and dollar amounts ---
                    new LongRegexMatches( "Percent", Pattern.compile( "(" + RANGE + "|" + DECIMAL + ")%" ), 2, 4 ),
                    new RegexMatches( "10^3n", Pattern.compile( ILLION, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Numeric", Pattern.compile( DECIMAL ), 1, 3 ),
                    new LongRegexMatches( "BigNumber", Pattern.compile( COMMA_DECIMAL ), 1, 7 ),
                    new LongRegexMatches( "kmbNumber",
                            Pattern.compile( DECIMAL + ILLION, Pattern.CASE_INSENSITIVE ), 1, 4 ),
                    new RegexMatches( "kmbMixed", Pattern.compile( MIXED_ILLION, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Dollars", Pattern.compile( "[$](" + RANGE + "|" + DECIMAL + "|"
                            + COMMA_DECIMAL + "|" + DECIMAL + ILLION + "|" + MIXED_ILLION + ")",
                            Pattern.CASE_INSENSITIVE ), 2, 8 ),

                    new RegexMatches( "NumberWord", Pattern.compile( NUMBER_WORD, Pattern.CASE_INSENSITIVE ) ),
                   //FIXME useful beyond this?
                    new RegexMatches( "Currency", Pattern.compile( CURRENCY, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "MoneyWords", Pattern.compile( MONEYWORDS, Pattern.CASE_INSENSITIVE ), 2,
                            4 ),

                    // --- times of day ---
                    new LongRegexMatches( "AmPm", Pattern.compile( AMPM, Pattern.CASE_INSENSITIVE ), 1, 4 ),
                    new RegexMatches( "MixedAmPm", Pattern.compile( MIXED_AMPM, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "TimeNum", Pattern.compile( TIMENUM ), 3, 5 ),
                    new RegexMatches( "TimeZone", Pattern.compile( TIMEZONES, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Time", Pattern.compile( TIME, Pattern.CASE_INSENSITIVE ), 1, 9 ),
                    new LongRegexMatches( "TimeRange", Pattern.compile( TIMERANGE, Pattern.CASE_INSENSITIVE ), 3,
                            19 ),

                    // --- phone numbers ---
                    new LongRegexMatches( "P10", Pattern.compile( P10 ), 3, 7 ),
                    // The "P5" feature used to compile P10 (copy-paste slip), which made
                    // it a narrower-span duplicate of "P10"; it now uses the P5 pattern.
                    new LongRegexMatches( "P5", Pattern.compile( P5 ), 3, 3 ),
                    new LongRegexMatches( "Phone", Pattern.compile( P10 + "|" + P5 ), 3, 7 ),

                    // --- month names ---
                    new RegexMatches( "UncasedMonthName", Pattern.compile( MONTHNAME, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "UncasedMonthAbbr",
                            Pattern.compile( MONTHABBR, Pattern.CASE_INSENSITIVE ), 1, 2 ),
                    new LongRegexMatches( "CasedMonth", Pattern.compile( MONTH ), 1, 2 ),
                    new LongRegexMatches( "UncasedMonth", Pattern.compile( MONTH, Pattern.CASE_INSENSITIVE ), 1, 2 ),

                    // --- weekday names ---
                    new RegexMatches( "UncasedWeekdayName", Pattern.compile( WEEKDAYNAME, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "UncasedWeekdayAbbr", Pattern.compile( WEEKDAYABBR,
                            Pattern.CASE_INSENSITIVE ), 1, 2 ),
                    new LongRegexMatches( "CasedWeekday", Pattern.compile( WEEKDAY ), 1, 2 ),
                    new LongRegexMatches( "UncasedWeekday", Pattern.compile( WEEKDAY, Pattern.CASE_INSENSITIVE ),
                            1, 2 ),

                    // --- composite dates ---
                    new LongRegexMatches( "MonthDay", Pattern.compile( MONTHDAY, Pattern.CASE_INSENSITIVE ), 2, 3 ),
                    new LongRegexMatches( "DayMonthDay", Pattern.compile( DAYMONTHDAY, Pattern.CASE_INSENSITIVE ),
                            3, 6 ),
                    new LongRegexMatches( "MonthYear", Pattern.compile( MONTHYEAR, Pattern.CASE_INSENSITIVE ), 2, 4 ),
                    new LongRegexMatches( "MonthDayYear",
                            Pattern.compile( MONTHDAYYEAR, Pattern.CASE_INSENSITIVE ), 3, 5 ),
                    new LongRegexMatches( "DayMonthDayYear", Pattern.compile( DAYMONTHDAYYEAR,
                            Pattern.CASE_INSENSITIVE ), 4, 8 ),

                    new LongRegexMatches( "SeparatorDate", Pattern.compile( SEPDATE ), 3, 5 ),
                    new LongRegexMatches( "FullSeparatorDate", Pattern.compile( FULLSEPDATE ), 5, 5 ),
            } );
}
 
开发者ID:BlueBrain,项目名称:bluima,代码行数:80,代码来源:NEPipes.java

示例8: TrainCRF

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Builds the feature pipeline, loads the gzip-compressed training and
 * testing files through it, and trains a CRF by label likelihood. Per-class
 * and token accuracy on the testing instances are registered as evaluators.
 * The trained model is not stored or returned by this constructor.
 *
 * @param trainingFilename
 *            path of the gzip-compressed training file
 * @param testingFilename
 *            path of the gzip-compressed testing file
 * @throws IOException
 *             if either file cannot be opened or read
 */
public TrainCRF(String trainingFilename, String testingFilename) throws IOException {

    ArrayList<Pipe> pipes = new ArrayList<Pipe>();

    // Conjoin features with those at offsets -1 and +1.
    int[][] conjunctions = new int[2][];
    conjunctions[0] = new int[] { -1 };
    conjunctions[1] = new int[] { 1 };

    pipes.add(new SimpleTaggerSentence2TokenSequence());
    pipes.add(new OffsetConjunctions(conjunctions));
    //pipes.add(new FeaturesInWindow("PREV-", -1, 1));
    pipes.add(new TokenTextCharSuffix("C1=", 1));
    pipes.add(new TokenTextCharSuffix("C2=", 2));
    pipes.add(new TokenTextCharSuffix("C3=", 3));
    pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
    pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
    pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
    pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*")));
    pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
    pipes.add(new TokenSequence2FeatureVectorSequence());

    Pipe pipe = new SerialPipes(pipes);

    InstanceList trainingInstances = new InstanceList(pipe);
    InstanceList testingInstances = new InstanceList(pipe);

    // Load each file through a helper that closes the stream chain;
    // previously the readers were never closed.
    addGzippedInstances(trainingInstances, trainingFilename);
    addGzippedInstances(testingInstances, testingFilename);

    CRF crf = new CRF(pipe, null);
    //crf.addStatesForLabelsConnectedAsIn(trainingInstances);
    crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingInstances);
    crf.addStartState();

    CRFTrainerByLabelLikelihood trainer =
            new CRFTrainerByLabelLikelihood(crf);
    trainer.setGaussianPriorVariance(10.0);

    //CRFTrainerByStochasticGradient trainer =
    //new CRFTrainerByStochasticGradient(crf, 1.0);

    //CRFTrainerByL1LabelLikelihood trainer =
    //	new CRFTrainerByL1LabelLikelihood(crf, 0.75);

    //trainer.addEvaluator(new PerClassAccuracyEvaluator(trainingInstances, "training"));
    trainer.addEvaluator(new PerClassAccuracyEvaluator(testingInstances, "testing"));
    trainer.addEvaluator(new TokenAccuracyEvaluator(testingInstances, "testing"));
    trainer.train(trainingInstances);

}

/**
 * Reads the gzip-compressed file {@code filename} as blank-line-separated
 * line groups, adds them to {@code instances} through its pipe, and closes
 * the underlying streams afterwards, also when reading fails.
 */
private static void addGzippedInstances(InstanceList instances, String filename) throws IOException {
    FileInputStream fileIn = new FileInputStream(filename);
    try {
        // NOTE(review): the reader uses the platform default charset, as before.
        BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileIn)));
        try {
            instances.addThruPipe(new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
        } finally {
            reader.close();
        }
    } finally {
        // Covers the case where GZIPInputStream's constructor throws before
        // the reader exists; closing an already closed stream is harmless.
        fileIn.close();
    }
}
 
开发者ID:karahindiba,项目名称:WikiInfoboxExtractor,代码行数:51,代码来源:TrainCRF.java

示例9: TrainWikiCRF

import cc.mallet.pipe.tsf.RegexMatches; //导入依赖的package包/类
/**
 * Builds the feature pipeline, loads the gzip-compressed training and
 * testing files through it, and trains a CRF by label likelihood. Per-class
 * and token accuracy on the testing instances are registered as evaluators.
 * The trained model is not stored or returned by this constructor.
 *
 * @param trainingFilename
 *            path of the gzip-compressed training file
 * @param testingFilename
 *            path of the gzip-compressed testing file
 * @throws IOException
 *             if either file cannot be opened or read
 */
public TrainWikiCRF(String trainingFilename, String testingFilename) throws IOException {
	
	ArrayList<Pipe> pipes = new ArrayList<Pipe>();

	// Conjoin features with those at offsets -1 and +1.
	int[][] conjunctions = new int[2][];
	conjunctions[0] = new int[] { -1 };
	conjunctions[1] = new int[] { 1 };

	pipes.add(new SimpleTaggerSentence2TokenSequence());
	pipes.add(new OffsetConjunctions(conjunctions));
	//pipes.add(new FeaturesInWindow("PREV-", -1, 1));
	pipes.add(new TokenTextCharSuffix("C1=", 1));
	pipes.add(new TokenTextCharSuffix("C2=", 2));
	pipes.add(new TokenTextCharSuffix("C3=", 3));
	pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
	pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
	pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
	pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile(".*\\$.*")));
	pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
	pipes.add(new TokenSequence2FeatureVectorSequence());

	Pipe pipe = new SerialPipes(pipes);

	InstanceList trainingInstances = new InstanceList(pipe);
	InstanceList testingInstances = new InstanceList(pipe);

	// Load each file through a helper that closes the stream chain;
	// previously the readers were never closed.
	addGzippedInstances(trainingInstances, trainingFilename);
	addGzippedInstances(testingInstances, testingFilename);
	
	CRF crf = new CRF(pipe, null);
	//crf.addStatesForLabelsConnectedAsIn(trainingInstances);
	crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingInstances);
	crf.addStartState();

	CRFTrainerByLabelLikelihood trainer = 
		new CRFTrainerByLabelLikelihood(crf);
	trainer.setGaussianPriorVariance(10.0);

	//CRFTrainerByStochasticGradient trainer = 
	//new CRFTrainerByStochasticGradient(crf, 1.0);

	//CRFTrainerByL1LabelLikelihood trainer = 
	//	new CRFTrainerByL1LabelLikelihood(crf, 0.75);

	//trainer.addEvaluator(new PerClassAccuracyEvaluator(trainingInstances, "training"));
	trainer.addEvaluator(new PerClassAccuracyEvaluator(testingInstances, "testing"));
	trainer.addEvaluator(new TokenAccuracyEvaluator(testingInstances, "testing"));
	trainer.train(trainingInstances);
	
}

/**
 * Reads the gzip-compressed file {@code filename} as blank-line-separated
 * line groups, adds them to {@code instances} through its pipe, and closes
 * the underlying streams afterwards, also when reading fails.
 */
private static void addGzippedInstances(InstanceList instances, String filename) throws IOException {
	FileInputStream fileIn = new FileInputStream(filename);
	try {
		// NOTE(review): the reader uses the platform default charset, as before.
		BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileIn)));
		try {
			instances.addThruPipe(new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
		} finally {
			reader.close();
		}
	} finally {
		// Covers the case where GZIPInputStream's constructor throws before
		// the reader exists; closing an already closed stream is harmless.
		fileIn.close();
	}
}
 
开发者ID:karahindiba,项目名称:WikiInfoboxExtractor,代码行数:51,代码来源:TrainWikiCRF.java


注:本文中的cc.mallet.pipe.tsf.RegexMatches类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。