本文整理汇总了Java中org.apache.spark.ml.feature.NGram类的典型用法代码示例。如果您正苦于以下问题：Java NGram类的具体用法？Java NGram怎么用？Java NGram使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
NGram类属于org.apache.spark.ml.feature包，在下文中一共展示了NGram类的4个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: createNGramDataFrame
import org.apache.spark.ml.feature.NGram; //导入依赖的package包/类
/**
 * Creates an n-gram (bigram) data frame from lines of text.
 * <p>
 * Each line is split on runs of whitespace into a {@code words} array column, and an
 * {@link NGram} transformer with {@code n = 2} adds an {@code ngrams} column. The first
 * 10 rows of the result are printed untruncated via {@code show(10, false)} before the
 * data frame is returned.
 * <p>
 * A line that starts with whitespace does not contribute an empty first token, and an
 * empty line yields an empty {@code words} array.
 *
 * @param lines the text lines to tokenize
 * @return a data frame holding the {@code words} column and the derived {@code ngrams} column
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            String[] tokens = line.split("\\s+");
            // String.split keeps an empty leading substring when the line starts with
            // whitespace, and returns [""] for an empty line; drop that bogus token so
            // it does not end up inside the n-grams.
            if (tokens.length > 0 && tokens[0].isEmpty()) {
                tokens = Arrays.copyOfRange(tokens, 1, tokens.length);
            }
            // The list is passed as ONE column value (the "words" array), not as varargs.
            return RowFactory.create(Arrays.asList(tokens));
        }
    });

    // Single non-nullable column "words" of type array<string>.
    StructType schema = new StructType(new StructField[] {
        new StructField("words",
            DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty()) });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);

    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
        .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
示例2: encodeFeatures
import org.apache.spark.ml.feature.NGram; //导入依赖的package包/类
/**
 * Encodes the features for the wrapped {@link NGram} transformer.
 * <p>
 * Looks up the single feature registered in the encoder under the transformer's input
 * column and returns it unchanged, wrapped in a one-element list.
 *
 * @param encoder the encoder that holds the features of the input column
 * @return a singleton list containing the input column's {@code DocumentFeature}
 * @throws ClassCastException if the feature found for the input column is not a
 *         {@code DocumentFeature}
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
    NGram ngram = getTransformer();
    String inputCol = ngram.getInputCol();

    // The input column is expected to carry exactly one feature, and it must be a document feature.
    DocumentFeature inputFeature = (DocumentFeature)encoder.getOnlyFeature(inputCol);

    List<Feature> encoded = Collections.<Feature>singletonList(inputFeature);
    return encoded;
}
示例3: getCommonFeatures
import org.apache.spark.ml.feature.NGram; //导入依赖的package包/类
/**
 * Runs a fixed chain of feature transformers over {@code df} and returns the result.
 * <p>
 * Stages, in order:
 * <ol>
 *   <li>{@code RegexTokenizer}: {@code "content"} into {@code tokenizerOutputCol}, splitting on
 *       runs of whitespace and the punctuation characters {@code ! , . ? ; ' "}.</li>
 *   <li>{@code TokenFeaturesExtractor}: writes {@code "commonfeatures"}.</li>
 *   <li>{@code StopWordsRemover}: case-insensitive, using {@code GazetteerContainer.STOP_WORDS},
 *       {@code tokenizerOutputCol} into {@code "filtered"}.</li>
 *   <li>{@code NGram}: {@code "filtered"} into {@code "ngrams"}; {@code n} is not set here, so
 *       the transformer's own default applies.</li>
 * </ol>
 *
 * @param sqlContxt not used by this method
 * @param df the input data frame; the tokenizer reads its {@code "content"} column
 * @param tokenizerOutputCol name of the column that receives the tokenizer output
 * @return the data frame produced by the last stage
 */
public static DataFrame getCommonFeatures(SQLContext sqlContxt, DataFrame df, String tokenizerOutputCol) {
    // Stage 1: tokenize. (A whitespace-only pattern "\\s+" was tried previously and left disabled.)
    RegexTokenizer regexTokenizer = new RegexTokenizer()
        .setInputCol("content")
        .setOutputCol(tokenizerOutputCol)
        .setPattern("[\\s!,.?;'\"]+");
    DataFrame tokenized = regexTokenizer.transform(df);

    // Stage 2: token-level features.
    // NOTE(review): the extractor is wired to the tokenizer's INPUT column ("content"),
    // not to its output column -- confirm this is intended.
    TokenFeaturesExtractor featuresExtractor = new TokenFeaturesExtractor()
        .setInputCol(regexTokenizer.getInputCol())
        .setOutputCol("commonfeatures");
    DataFrame withCommonFeatures = featuresExtractor.transform(tokenized);

    // Stage 3: drop stop words from the tokenizer output, ignoring case.
    StopWordsRemover stopWordsRemover = new StopWordsRemover()
        .setCaseSensitive(false)
        .setStopWords(GazetteerContainer.STOP_WORDS.toArray(new String[GazetteerContainer.STOP_WORDS.size()]))
        .setInputCol(regexTokenizer.getOutputCol())
        .setOutputCol("filtered");
    DataFrame withoutStopWords = stopWordsRemover.transform(withCommonFeatures);

    // Stage 4: n-grams over the filtered tokens.
    NGram ngram = new NGram()
        .setInputCol(stopWordsRemover.getOutputCol())
        .setOutputCol("ngrams");
    return ngram.transform(withoutStopWords);
}
示例4: NGramConverter
import org.apache.spark.ml.feature.NGram; //导入依赖的package包/类
/**
 * Creates a converter for the given {@link NGram} transformer.
 *
 * @param transformer the NGram transformer; passed unchanged to the superclass constructor
 */
public NGramConverter(NGram transformer){
super(transformer);
}