This article collects typical usage examples of the JavaPairRDD.map method from the Java class org.apache.spark.api.java.JavaPairRDD. If you are unsure what JavaPairRDD.map does, how to call it, or where it is typically used, the curated method examples below should help. You can also explore the containing class org.apache.spark.api.java.JavaPairRDD for further usage examples.
Below are 15 code examples of JavaPairRDD.map, ordered roughly by popularity.
Example 1: main
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public static void main(String[] args) {
SparkSession sparkSession = SparkSession.builder().master("local").appName("My App")
.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());
JavaPairRDD<String, String> userIdToCityId = jsc.parallelizePairs(
Arrays.asList(new Tuple2<String, String>("1", "101"), new Tuple2<String, String>("2", "102"),
new Tuple2<String, String>("3", "107"), new Tuple2<String, String>("4", "103"),
new Tuple2<String, String>("11", "101"), new Tuple2<String, String>("12", "102"),
new Tuple2<String, String>("13", "107"), new Tuple2<String, String>("14", "103")));
JavaPairRDD<String, String> cityIdToCityName = jsc.parallelizePairs(
Arrays.asList(new Tuple2<String, String>("101", "India"), new Tuple2<String, String>("102", "UK"),
new Tuple2<String, String>("103", "Germany"), new Tuple2<String, String>("107", "USA")));
Broadcast<Map<String, String>> citiesBroadcasted = jsc.broadcast(cityIdToCityName.collectAsMap());
JavaRDD<Tuple3<String, String, String>> joined = userIdToCityId.map(
v1 -> new Tuple3<String, String, String>(v1._1(), v1._2(), citiesBroadcasted.value().get(v1._2())));
System.out.println(joined.collect());
}
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 26, Source file: MapSideJoinBroadcast.java
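For comparison, the same three-column result could be produced with an ordinary shuffle-based join by re-keying the user RDD on city id. The sketch below reuses the RDD variables from the example above; unlike the broadcast version, the join step triggers a shuffle:
// Sketch: shuffle-based equivalent of the broadcast map-side join above.
JavaPairRDD<String, String> cityIdToUserId =
        userIdToCityId.mapToPair(t -> new Tuple2<String, String>(t._2(), t._1()));   // (cityId, userId)
JavaPairRDD<String, Tuple2<String, String>> joinedOnCityId =
        cityIdToUserId.join(cityIdToCityName);                                       // (cityId, (userId, cityName))
JavaRDD<Tuple3<String, String, String>> shuffleJoined = joinedOnCityId.map(
        t -> new Tuple3<String, String, String>(t._2()._1(), t._1(), t._2()._2()));  // (userId, cityId, cityName)
System.out.println(shuffleJoined.collect());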
Example 2: aggregateScores
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
/**
* Combines {@link Rating}s with the same user/item pair into one. For implicit data the
* combined score is the sum of all scores; otherwise the last score wins.
*/
private JavaRDD<Rating> aggregateScores(JavaRDD<Rating> original, double epsilon) {
JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
if (implicit) {
// TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
// they don't guarantee the delete elements are properly handled
aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
} else {
// For non-implicit, last wins.
aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
}
JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
aggregated.filter(kv -> !Double.isNaN(kv._2()));
if (logStrength) {
return noNaN.map(userProductScore -> new Rating(
userProductScore._1()._1(),
userProductScore._1()._2(),
Math.log1p(userProductScore._2() / epsilon)));
} else {
return noNaN.map(userProductScore -> new Rating(
userProductScore._1()._1(),
userProductScore._1()._2(),
userProductScore._2()));
}
}
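To make the two branches concrete, here is a minimal standalone sketch of the same idea (it assumes an existing JavaSparkContext named jsc and ignores the NaN "delete" handling that MLFunctions.SUM_WITH_NAN provides): summing per key corresponds to the implicit case, while the Double.NaN-seeded fold keeps only the last score per key.
// Sketch: (user, item) -> score pairs, with one duplicate key.
JavaPairRDD<Tuple2<Integer, Integer>, Double> scores = jsc.parallelizePairs(Arrays.asList(
        new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<>(1, 10), 2.0),
        new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<>(1, 10), 3.0),
        new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<>(2, 10), 5.0)));
// Implicit-style aggregation: scores for the same key are summed, so (1,10) -> 5.0.
JavaPairRDD<Tuple2<Integer, Integer>, Double> summed = scores.reduceByKey((a, b) -> a + b);
// Explicit-style aggregation: the last score wins (encounter order), so (1,10) -> 3.0.
JavaPairRDD<Tuple2<Integer, Integer>, Double> lastWins =
        scores.foldByKey(Double.NaN, (current, next) -> next);
System.out.println(summed.collectAsMap());
System.out.println(lastWins.collectAsMap());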
Example 3: run
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public void run() throws IOException {
FileSystem fs = DistributedFileSystem.get(new Configuration());
Path inpath = new Path(input);
Path outpath = new Path(output);
if (!fs.exists(inpath)) {
throw new IllegalArgumentException("Input file not found: " + inpath);
}
if (fs.exists(outpath)) {
throw new IllegalArgumentException("Output file exists, Not overwriting it: " + inpath);
}
SparkConf conf = new SparkConf();
conf.setMaster(sparkMaster);
conf.setAppName(getClass().getSimpleName() + "::" + System.currentTimeMillis());
JavaSparkContext ctx = new JavaSparkContext(conf);
//STEP1: READ
JavaPairRDD<Text, BytesWritable> rdd = ctx.sequenceFile(input, Text.class, BytesWritable.class);
//.mapToPair(rec -> new Tuple2<>(new Text(rec._1()), new BytesWritable(rec._2().getBytes())));
//STEP2: PARSE
JavaPairRDD<Text, Metadata> parsedRDD = rdd.mapToPair(
(PairFunction<Tuple2<Text, BytesWritable>, Text, Metadata>) rec -> {
Metadata md = new Metadata();
try (ByteArrayInputStream stream = new ByteArrayInputStream(rec._2().getBytes())) {
String content = TikaHolder.tika.parseToString(stream, md);
md.add("CONTENT", content);
}
return new Tuple2<>(rec._1(), md);
});
//STEP3: FORMAT
JavaRDD<String> outRDD = parsedRDD.map((Function<Tuple2<Text, Metadata>, String>) rec -> {
String key = rec._1().toString();
Metadata metadata = rec._2();
JSONObject object = new JSONObject();
for (String name : metadata.names()) {
if (metadata.isMultiValued(name)) {
JSONArray arr = new JSONArray();
for (String val : metadata.getValues(name)) {
arr.add(val);
}
object.put(name, arr);
} else {
object.put(name, metadata.get(name));
}
}
return key + "\t\t" + object.toJSONString();
});
//STEP4: SAVE
LOG.info("Saving at " + outpath);
outRDD.saveAsTextFile(output);
LOG.info("Stopping");
ctx.stop();
}
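Each saved record is a plain text line: the sequence-file key, two tab characters, then the metadata as JSON. A rough sketch of reading the result back in a follow-up job (the context name ctx2 is hypothetical; sparkMaster and output are the fields used above):
// Sketch: read back the key \t\t JSON lines written by the job above.
JavaSparkContext ctx2 = new JavaSparkContext(new SparkConf().setMaster(sparkMaster).setAppName("ReadParsedOutput"));
JavaPairRDD<String, String> keyToJson = ctx2.textFile(output).mapToPair(line -> {
    String[] parts = line.split("\t\t", 2);   // [key, JSON document]
    return new Tuple2<String, String>(parts[0], parts.length > 1 ? parts[1] : "");
});
System.out.println(keyToJson.take(5));
ctx2.stop();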
Example 4: main
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("SamToFastq");
sc = new JavaSparkContext(conf);
String in = args[0];
String out = args[1];
JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
//Map to SAMRecord RDD
JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD);
fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
sc.stop();
}
Example 5: main
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new HiveContext(sc.sc());
Options options = new Options();
Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
Option queryOpt = new Option( "query", true, "SQL query string." );
Option baminOpt = new Option( "in", true, "" );
options.addOption( opOpt );
options.addOption( queryOpt );
options.addOption( baminOpt );
CommandLineParser parser = new BasicParser();
CommandLine cmd = null;
try {
cmd = parser.parse( options, args );
}
catch( ParseException exp ) {
System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
}
String bwaOutDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null;
String bamin = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);
//Read BAM/SAM from HDFS
JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
//Map to SAMRecord RDD
JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));
Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
samDF.registerTempTable(tablename);
if(query!=null) {
//Save as parquet file
Dataset df2 = sqlContext.sql(query);
df2.show(100,false);
if(bwaOutDir!=null)
df2.write().parquet(bwaOutDir);
}else{
if(bwaOutDir!=null)
samDF.write().parquet(bwaOutDir);
}
sc.stop();
}
Example 6: calculateSimilarityFromVector
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
/**
* Calculate term similarity from vector.
*
* @param importRDD the {@link org.apache.spark.api.java.JavaPairRDD}
* data structure containing the vectors.
* @param simType the similarity calculation to execute e.g.
* <ul>
* <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_COSINE} - 3,</li>
* <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_HELLINGER} - 2,</li>
* <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_PEARSON} - 1</li>
* </ul>
* @return a new {@link org.apache.spark.api.java.JavaRDD} of {@link LinkageTriple} elements
*/
public static JavaRDD<LinkageTriple> calculateSimilarityFromVector(JavaPairRDD<String, Vector> importRDD, int simType) {
JavaRDD<Tuple2<String, Vector>> importRDD1 = importRDD.map(f -> new Tuple2<String, Vector>(f._1, f._2));
JavaPairRDD<Tuple2<String, Vector>, Tuple2<String, Vector>> cartesianRDD = importRDD1.cartesian(importRDD1);
return cartesianRDD.map(new Function<Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>>, LinkageTriple>() {
/**
*
*/
private static final long serialVersionUID = 1L;
@Override
public LinkageTriple call(Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>> arg) {
String keyA = arg._1._1;
String keyB = arg._2._1;
if (keyA.equals(keyB)) {
return null;
}
Vector vecA = arg._1._2;
Vector vecB = arg._2._2;
Double weight = 0.0;
if (simType == SimilarityUtil.SIM_PEARSON) {
weight = SimilarityUtil.pearsonDistance(vecA, vecB);
} else if (simType == SimilarityUtil.SIM_HELLINGER) {
weight = SimilarityUtil.hellingerDistance(vecA, vecB);
}
LinkageTriple triple = new LinkageTriple();
triple.keyA = keyA;
triple.keyB = keyB;
triple.weight = weight;
return triple;
}
}).filter(new Function<LinkageTriple, Boolean>() {
/**
*
*/
private static final long serialVersionUID = 1L;
@Override
public Boolean call(LinkageTriple arg0) throws Exception {
if (arg0 == null) {
return false;
}
return true;
}
});
}
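Since the anonymous Function classes above wrap fairly small bodies, the same computation can be sketched more compactly with lambdas (cartesianRDD and simType are the variables from the method above; behaviour, including the null filtering of self-pairs, is unchanged):
// Sketch: the same cartesian similarity computation written with lambdas.
JavaRDD<LinkageTriple> triples = cartesianRDD.map(arg -> {
    if (arg._1._1.equals(arg._2._1)) {
        return null;                                   // skip self-pairs
    }
    LinkageTriple triple = new LinkageTriple();
    triple.keyA = arg._1._1;
    triple.keyB = arg._2._1;
    if (simType == SimilarityUtil.SIM_PEARSON) {
        triple.weight = SimilarityUtil.pearsonDistance(arg._1._2, arg._2._2);
    } else if (simType == SimilarityUtil.SIM_HELLINGER) {
        triple.weight = SimilarityUtil.hellingerDistance(arg._1._2, arg._2._2);
    } else {
        triple.weight = 0.0;
    }
    return triple;
}).filter(t -> t != null);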
Example 7: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<Vector> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
GaussianMixtureModelSummary gaussianMixtureModelSummary) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
int numberOfTargetValue = listOfTargetFeatures.size();
JavaRDD<Vector> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, Vector>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()){
values[j] = Math.abs(values[j]);
}
}
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return Vectors.dense(values);
}
}
gaussianMixtureModelSummary.updateSummary(idx, feature);
return Vectors.dense(values);
}
);
Normalizer normalizer = new Normalizer();
JavaRDD<Vector> normed;
if (athenaMLFeatureConfiguration.isNormalization()) {
normed = normalizer.transform(parsedData);
} else {
normed = parsedData;
}
normed.cache();
return normed;
}
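The feature-extraction loop in the lambda above reappears verbatim in the clustering and classification examples that follow (examples 8 through 15). One way to avoid the duplication would be a small helper such as this sketch (the helper name and signature are hypothetical; the accessors on AthenaFeatureField and BSONObject are the ones already used in the examples):
// Hypothetical helper: convert one feature BSONObject into the weighted double[]
// built inline by the rddPreProcessing examples.
private static double[] toFeatureValues(BSONObject feature,
                                        List<AthenaFeatureField> targetFeatures,
                                        Map<AthenaFeatureField, Integer> weight,
                                        boolean absolute) {
    double[] values = new double[targetFeatures.size()];
    for (int j = 0; j < values.length; j++) {
        AthenaFeatureField field = targetFeatures.get(j);
        if (!feature.containsField(field.getValue())) {
            continue;                                  // missing feature stays 0
        }
        Object obj = feature.get(field.getValue());
        if (obj instanceof Long) {
            values[j] = (Long) obj;
        } else if (obj instanceof Double) {
            values[j] = (Double) obj;
        } else if (obj instanceof Boolean) {
            values[j] = (Boolean) obj ? 1 : 0;
        }
        if (weight.containsKey(field)) {
            values[j] *= weight.get(field);            // apply configured weight
        }
        if (absolute) {
            values[j] = Math.abs(values[j]);
        }
    }
    // "remove errors": zero the whole vector if any entry is infinite or NaN
    for (double v : values) {
        if (Double.isInfinite(v) || Double.isNaN(v)) {
            return new double[values.length];
        }
    }
    return values;
}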
Example 8: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<Vector> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
KmeansModelSummary kmeansModelSummary) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
int numberOfTargetValue = listOfTargetFeatures.size();
// int numberOfTargetValue = 5;
JavaRDD<Vector> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, Vector>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()) {
values[j] = Math.abs(values[j]);
}
}
// values[j] = 0;
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return Vectors.dense(values);
}
}
kmeansModelSummary.updateSummary(idx, feature);
return Vectors.dense(values);
}
);
Normalizer normalizer = new Normalizer();
JavaRDD<Vector> normed;
if (athenaMLFeatureConfiguration.isNormalization()) {
normed = normalizer.transform(parsedData);
} else {
normed = parsedData;
}
normed.cache();
return normed;
}
Example 9: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
GradientBoostedTreesModelSummary gradientBoostedTreesModelSummary,
Marking marking) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
int numberOfTargetValue = listOfTargetFeatures.size();
Normalizer normalizer = new Normalizer();
JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
int label = marking.checkClassificationMarkingElements(idx, feature);
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()){
values[j] = Math.abs(values[j]);
}
}
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return new LabeledPoint(label, Vectors.dense(values));
}
}
Vector normedForVal;
if (athenaMLFeatureConfiguration.isNormalization()) {
normedForVal = normalizer.transform(Vectors.dense(values));
} else {
normedForVal = Vectors.dense(values);
}
gradientBoostedTreesModelSummary.updateSummary(idx, feature);
return new LabeledPoint(label, normedForVal);
}
);
return parsedData;
}
Example 10: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
RandomForestModelSummary randomForestModelSummary,
Marking marking) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
int numberOfTargetValue = listOfTargetFeatures.size();
Normalizer normalizer = new Normalizer();
JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
int label = marking.checkClassificationMarkingElements(idx, feature);
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()){
values[j] = Math.abs(values[j]);
}
}
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return new LabeledPoint(label, Vectors.dense(values));
}
}
Vector normedForVal;
if (athenaMLFeatureConfiguration.isNormalization()) {
normedForVal = normalizer.transform(Vectors.dense(values));
} else {
normedForVal = Vectors.dense(values);
}
randomForestModelSummary.updateSummary(idx, feature);
return new LabeledPoint(label, normedForVal);
}
);
return parsedData;
}
Example 11: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
SVMModelSummary SVMModelSummary,
Marking marking) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
int numberOfTargetValue = listOfTargetFeatures.size();
Normalizer normalizer = new Normalizer();
JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
int label = marking.checkClassificationMarkingElements(idx, feature);
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()){
values[j] = Math.abs(values[j]);
}
}
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return new LabeledPoint(label, Vectors.dense(values));
}
}
Vector normedForVal;
if (athenaMLFeatureConfiguration.isNormalization()) {
normedForVal = normalizer.transform(Vectors.dense(values));
} else {
normedForVal = Vectors.dense(values);
}
SVMModelSummary.updateSummary(idx, feature);
return new LabeledPoint(label, normedForVal);
}
);
return parsedData;
}
Example 12: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
LogisticRegressionModelSummary logisticRegressionModelSummary,
Marking marking) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
int numberOfTargetValue = listOfTargetFeatures.size();
Normalizer normalizer = new Normalizer();
JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
int label = marking.checkClassificationMarkingElements(idx, feature);
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()){
values[j] = Math.abs(values[j]);
}
}
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return new LabeledPoint(label, Vectors.dense(values));
}
}
Vector normedForVal;
if (athenaMLFeatureConfiguration.isNormalization()) {
normedForVal = normalizer.transform(Vectors.dense(values));
} else {
normedForVal = Vectors.dense(values);
}
logisticRegressionModelSummary.updateSummary(idx, feature);
return new LabeledPoint(label, normedForVal);
}
);
return parsedData;
}
Example 13: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
DecisionTreeModelSummary decisionTreeModelSummary,
Marking marking) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
int numberOfTargetValue = listOfTargetFeatures.size();
Normalizer normalizer = new Normalizer();
JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
int label = marking.checkClassificationMarkingElements(idx, feature);
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()){
values[j] = Math.abs(values[j]);
}
}
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return new LabeledPoint(label, Vectors.dense(values));
}
}
Vector normedForVal;
if (athenaMLFeatureConfiguration.isNormalization()) {
normedForVal = normalizer.transform(Vectors.dense(values));
} else {
normedForVal = Vectors.dense(values);
}
decisionTreeModelSummary.updateSummary(idx, feature);
return new LabeledPoint(label, normedForVal);
}
);
return parsedData;
}
Example 14: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
NaiveBayesModelSummary naiveBayesModelSummary,
Marking marking) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
Normalizer normalizer = new Normalizer();
int numberOfTargetValue = listOfTargetFeatures.size();
JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
int label = marking.checkClassificationMarkingElements(idx, feature);
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()){
values[j] = Math.abs(values[j]);
}
}
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return new LabeledPoint(label, Vectors.dense(values));
}
}
Vector normedForVal;
if (athenaMLFeatureConfiguration.isNormalization()) {
normedForVal = normalizer.transform(Vectors.dense(values));
} else {
normedForVal = Vectors.dense(values);
}
naiveBayesModelSummary.updateSummary(idx, feature);
return new LabeledPoint(label, normedForVal);
}
);
return parsedData;
}
Example 15: rddPreProcessing
import org.apache.spark.api.java.JavaPairRDD; // import the package/class the method depends on
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
RidgeRegressionModelSummary ridgeRegressionModelSummary,
Marking marking) {
List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
int numberOfTargetValue = listOfTargetFeatures.size();
Normalizer normalizer = new Normalizer();
JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
(Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {
BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
BSONObject idx = (BSONObject) t._2();
int label = marking.checkClassificationMarkingElements(idx, feature);
double[] values = new double[numberOfTargetValue];
for (int j = 0; j < numberOfTargetValue; j++) {
if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
if (obj instanceof Long) {
values[j] = (Long) obj;
} else if (obj instanceof Double) {
values[j] = (Double) obj;
} else if (obj instanceof Boolean) {
values[j] = (Boolean) obj ? 1 : 0;
} else {
values[j] = 0;
}
//check weight
if (weight.containsKey(listOfTargetFeatures.get(j))) {
values[j] *= weight.get(listOfTargetFeatures.get(j));
}
//check absolute
if (athenaMLFeatureConfiguration.isAbsolute()) {
values[j] = Math.abs(values[j]);
}
}
}
//remove errors
for (int i = 0; i < numberOfTargetValue; i++) {
if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
for (int j = 0; j < numberOfTargetValue; j++) {
values[j] = 0;
}
return new LabeledPoint(label, Vectors.dense(values));
}
}
Vector normedForVal;
if (athenaMLFeatureConfiguration.isNormalization()) {
normedForVal = normalizer.transform(Vectors.dense(values));
} else {
normedForVal = Vectors.dense(values);
}
ridgeRegressionModelSummary.updateSummary(idx, feature);
return new LabeledPoint(label, normedForVal);
}
);
return parsedData;
}