本文整理匯總了Java中org.apache.spark.ml.feature.NGram類的典型用法代碼示例。如果您正苦於以下問題：Java NGram類的具體用法？Java NGram怎麼用？Java NGram使用的例子？那麼，這裏精選的類代碼示例或許可以為您提供幫助。
NGram類屬於org.apache.spark.ml.feature包，在下文中一共展示了NGram類的4個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: createNGramDataFrame
import org.apache.spark.ml.feature.NGram; //導入依賴的package包/類
/**
 * Creates an n-gram (bigram) data frame from text lines.
 * <p>
 * Each line is trimmed and split on runs of whitespace into a non-nullable
 * "words" string-array column; an {@link NGram} transformer with n = 2 then
 * adds an "ngrams" column. Up to 10 rows of the result are printed,
 * untruncated, via {@code show(10, false)} before it is returned.
 *
 * @param lines the text lines to tokenize
 * @return a data frame holding the "words" column and the derived "ngrams" column
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            // Trim first: String.split keeps a leading empty string when the input
            // starts with a delimiter, which would inject an empty token (and thus
            // bogus bigrams) for lines that begin with whitespace.
            String trimmed = line.trim();
            // A blank line has no words; splitting "" would yield one empty token.
            String[] words = trimmed.isEmpty() ? new String[0] : trimmed.split("\\s+");
            // The list is passed as a single value, i.e. the one "words" column.
            return RowFactory.create(Arrays.asList(words));
        }
    });
    StructType schema = new StructType(new StructField[] {
        new StructField("words",
            DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty()) });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
        .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
示例2: encodeFeatures
import org.apache.spark.ml.feature.NGram; //導入依賴的package包/類
/**
 * Encodes the features produced by the wrapped {@link NGram} transformer.
 * <p>
 * Looks up the single feature registered with the encoder under the
 * transformer's input column and returns it unchanged as a one-element list.
 *
 * @param encoder the encoder that holds the features of the input column
 * @return a singleton list containing the input column's document feature
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
    String inputCol = getTransformer().getInputCol();
    // The cast is kept so that a non-document input still fails here.
    DocumentFeature inputFeature = (DocumentFeature)encoder.getOnlyFeature(inputCol);
    return Collections.<Feature>singletonList(inputFeature);
}
示例3: getCommonFeatures
import org.apache.spark.ml.feature.NGram; //導入依賴的package包/類
/**
 * Runs the shared text-feature pipeline over a data frame.
 * <p>
 * Stages, applied in order:
 * <ol>
 *   <li>tokenize the "content" column into {@code tokenizerOutputCol};</li>
 *   <li>run {@code TokenFeaturesExtractor} on the tokenizer's input column,
 *       writing "commonfeatures";</li>
 *   <li>remove stop words (case-insensitively) from the tokens, writing "filtered";</li>
 *   <li>build n-grams from the filtered tokens, writing "ngrams".</li>
 * </ol>
 *
 * @param sqlContxt not used by this method
 * @param df the input data frame; must contain a "content" column
 * @param tokenizerOutputCol name of the column that receives the tokens
 * @return the data frame with all derived columns appended
 */
public static DataFrame getCommonFeatures(SQLContext sqlContxt, DataFrame df, String tokenizerOutputCol) {
    // Stage 1: split on whitespace and common punctuation.
    RegexTokenizer tokenizer = new RegexTokenizer()
        .setInputCol("content")
        .setOutputCol(tokenizerOutputCol)
        .setPattern("[\\s!,.?;'\"]+");
    DataFrame tokenized = tokenizer.transform(df);

    // Stage 2: the extractor reads the tokenizer's *input* column, not its output.
    TokenFeaturesExtractor tokenFeatures = new TokenFeaturesExtractor()
        .setInputCol(tokenizer.getInputCol())
        .setOutputCol("commonfeatures");
    DataFrame withCommonFeatures = tokenFeatures.transform(tokenized);

    // Stage 3: drop stop words from the token column.
    String[] stopWords = GazetteerContainer.STOP_WORDS
        .toArray(new String[GazetteerContainer.STOP_WORDS.size()]);
    StopWordsRemover remover = new StopWordsRemover()
        .setCaseSensitive(false)
        .setStopWords(stopWords)
        .setInputCol(tokenizer.getOutputCol())
        .setOutputCol("filtered");
    DataFrame withoutStopWords = remover.transform(withCommonFeatures);

    // Stage 4: n-grams over the filtered tokens (n is left at the NGram default).
    NGram ngramTransformer = new NGram()
        .setInputCol(remover.getOutputCol())
        .setOutputCol("ngrams");
    return ngramTransformer.transform(withoutStopWords);
}
示例4: NGramConverter
import org.apache.spark.ml.feature.NGram; //導入依賴的package包/類
/**
 * Creates a converter for the given {@link NGram} transformer.
 * <p>
 * The transformer is handed to the superclass constructor as-is.
 *
 * @param transformer the NGram transformer to wrap
 */
public NGramConverter(NGram transformer){
super(transformer);
}