本文整理匯總了Java中org.apache.spark.ml.feature.NGram.transform方法的典型用法代碼示例。如果您正苦於以下問題:Java NGram.transform方法的具體用法?Java NGram.transform怎麼用?Java NGram.transform使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類org.apache.spark.ml.feature.NGram的用法示例。
在下文中一共展示了NGram.transform方法的2個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點贊,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: createNGramDataFrame
import org.apache.spark.ml.feature.NGram; //導入方法依賴的package包/類
/**
 * Builds a bigram data frame from text lines.
 *
 * <p>Each line is split on runs of whitespace into a single array-of-strings column named
 * "words"; an {@link NGram} transformer with n = 2 then writes the bigrams to a column named
 * "ngrams". The first 10 rows of the result are printed untruncated via
 * {@code show(10, false)} before the frame is returned.
 *
 * <p>A line that starts with whitespace (or is empty) does not contribute an empty first
 * token: {@code String.split} reports the text before the first delimiter as "", and that
 * token is dropped here so it cannot leak into the bigrams.
 *
 * <p>The SQL context is created from the {@code jsc} Spark context declared outside this
 * method.
 *
 * @param lines the text lines to tokenize
 * @return a data frame holding the "words" column and the derived "ngrams" column
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            String[] tokens = line.split("\\s+");
            // split() keeps a leading "" when the line begins with a delimiter (and returns
            // [""] for an empty line); remove it so no empty word reaches the n-grams.
            if (tokens.length > 0 && tokens[0].isEmpty()) {
                tokens = Arrays.copyOfRange(tokens, 1, tokens.length);
            }
            // The list is passed as ONE value: the row has a single array-typed field.
            return RowFactory.create(Arrays.asList(tokens));
        }
    });
    StructType schema = new StructType(new StructField[] {
        new StructField("words",
            DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty()) });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
        .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
示例2: getCommonFeatures
import org.apache.spark.ml.feature.NGram; //導入方法依賴的package包/類
/**
 * Applies a fixed four-stage feature pipeline to a data frame and returns the result.
 *
 * <ol>
 *   <li>A {@link RegexTokenizer} splits the "content" column on runs of whitespace and the
 *       punctuation characters {@code ! , . ? ; ' "} into {@code tokenizerOutputCol}.</li>
 *   <li>A {@code TokenFeaturesExtractor} reads the tokenizer's <em>input</em> column
 *       ("content") and writes "commonfeatures".</li>
 *   <li>A {@link StopWordsRemover} (case-insensitive, using
 *       {@code GazetteerContainer.STOP_WORDS}) filters the tokens into "filtered".</li>
 *   <li>An {@link NGram} transformer, with n left unset, turns "filtered" into "ngrams".</li>
 * </ol>
 *
 * @param sqlContxt SQL context; not used inside this method
 * @param df input data frame; must provide the "content" column read by the tokenizer
 * @param tokenizerOutputCol name of the column that receives the tokens
 * @return the data frame produced by the last stage
 */
public static DataFrame getCommonFeatures(SQLContext sqlContxt, DataFrame df, String tokenizerOutputCol) {
    // Stage 1: tokenize the raw content.
    final RegexTokenizer splitter = new RegexTokenizer()
        .setInputCol("content")
        .setOutputCol(tokenizerOutputCol)
        .setPattern("[\\s!,.?;'\"]+");
    final DataFrame tokenized = splitter.transform(df);

    // Stage 2: token-level features, computed from the same column the tokenizer consumed.
    final TokenFeaturesExtractor featureExtractor = new TokenFeaturesExtractor()
        .setInputCol(splitter.getInputCol())
        .setOutputCol("commonfeatures");
    final DataFrame withFeatures = featureExtractor.transform(tokenized);

    // Stage 3: drop stop words from the tokenizer output.
    final String[] stopWords =
        GazetteerContainer.STOP_WORDS.toArray(new String[GazetteerContainer.STOP_WORDS.size()]);
    final StopWordsRemover stopWordFilter = new StopWordsRemover()
        .setCaseSensitive(false)
        .setStopWords(stopWords)
        .setInputCol(splitter.getOutputCol())
        .setOutputCol("filtered");
    final DataFrame withoutStopWords = stopWordFilter.transform(withFeatures);

    // Stage 4: n-grams over the filtered tokens.
    final NGram ngramStage = new NGram()
        .setInputCol(stopWordFilter.getOutputCol())
        .setOutputCol("ngrams");
    return ngramStage.transform(withoutStopWords);
}