This article collects typical usage examples of the Java method org.apache.flink.api.java.DataSet.iterate. If you are wondering what DataSet.iterate does, how to call it, or what real usages look like, the hand-picked code examples below may help. You can also read more about the containing class, org.apache.flink.api.java.DataSet.
Below are 15 code examples of the DataSet.iterate method, sorted by popularity by default.
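Before the examples, here is a minimal, self-contained sketch of the basic iterate/closeWith pattern that all of them build on. The class name IterateSketch and the "+1" step function are illustrative only and are not taken from any example below.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.IterativeDataSet;

public class IterateSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // the initial data set that seeds the iteration
        DataSet<Long> initial = env.generateSequence(1, 5);

        // iterate(n) opens a bulk iteration that runs at most n supersteps
        IterativeDataSet<Long> loop = initial.iterate(10);

        // the step function: any transformation applied to the iteration head
        DataSet<Long> step = loop.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) {
                return value + 1; // illustrative per-superstep work
            }
        });

        // closeWith(step) feeds the step result back as the next superstep's input
        // and returns the data set produced after the final superstep
        DataSet<Long> result = loop.closeWith(step);

        result.print();
    }
}

The examples that follow vary this pattern with convergence criteria, aggregators, broadcast sets, and termination data sets passed to closeWith.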
Example 1: testBranchAfterIteration
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
@Test
public void testBranchAfterIteration() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSet<Long> sourceA = env.generateSequence(0, 1);

    IterativeDataSet<Long> loopHead = sourceA.iterate(10);
    DataSet<Long> loopTail = loopHead.map(new IdentityMapper<Long>()).name("Mapper");
    DataSet<Long> loopRes = loopHead.closeWith(loopTail);

    loopRes.output(new DiscardingOutputFormat<Long>());
    loopRes.map(new IdentityMapper<Long>())
            .output(new DiscardingOutputFormat<Long>());

    Plan plan = env.createProgramPlan();

    try {
        compileNoStats(plan);
    }
    catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Example 2: testProgram
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // the test data is constructed such that the merge join zig zag
    // has an early out, leaving elements on the dynamic path input unconsumed
    DataSet<Path> edges = env.fromElements(
            new Path(1, 2),
            new Path(1, 4),
            new Path(3, 6),
            new Path(3, 8),
            new Path(1, 10),
            new Path(1, 12),
            new Path(3, 14),
            new Path(3, 16),
            new Path(1, 18),
            new Path(1, 20));

    IterativeDataSet<Path> currentPaths = edges.iterate(10);

    DataSet<Path> newPaths = currentPaths
            .join(edges, JoinHint.REPARTITION_SORT_MERGE).where("to").equalTo("from")
            .with(new PathConnector())
            .union(currentPaths).distinct("from", "to");

    DataSet<Path> result = currentPaths.closeWith(newPaths);

    result.output(new DiscardingOutputFormat<Path>());
    env.execute();
}
Example 3: doBulkIteration
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
public static DataSet<Tuple2<Long, Long>> doBulkIteration(DataSet<Tuple2<Long, Long>> vertices, DataSet<Tuple2<Long, Long>> edges) {

    // open a bulk iteration
    IterativeDataSet<Tuple2<Long, Long>> iteration = vertices.iterate(20);

    DataSet<Tuple2<Long, Long>> changes = iteration
            .join(edges).where(0).equalTo(0).with(new Join222())
            .groupBy(0).aggregate(Aggregations.MIN, 1)
            .join(iteration).where(0).equalTo(0)
            .flatMap(new FlatMapJoin());

    // close the bulk iteration
    return iteration.closeWith(changes);
}
Example 4: testConnectedComponentsWithParametrizableConvergence
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
@Test
public void testConnectedComponentsWithParametrizableConvergence() throws Exception {

    // name of the aggregator that checks for convergence
    final String updatedElements = "updated.elements.aggr";

    // the iteration stops if less than this number of elements change value
    final long convergenceThreshold = 3;

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<Long, Long>> initialSolutionSet = env.fromCollection(verticesInput);
    DataSet<Tuple2<Long, Long>> edges = env.fromCollection(edgesInput);

    IterativeDataSet<Tuple2<Long, Long>> iteration = initialSolutionSet.iterate(10);

    // register the convergence criterion
    iteration.registerAggregationConvergenceCriterion(updatedElements,
            new LongSumAggregator(), new UpdatedElementsConvergenceCriterion(convergenceThreshold));

    DataSet<Tuple2<Long, Long>> verticesWithNewComponents = iteration.join(edges).where(0).equalTo(0)
            .with(new NeighborWithComponentIDJoin())
            .groupBy(0).min(1);

    DataSet<Tuple2<Long, Long>> updatedComponentId =
            verticesWithNewComponents.join(iteration).where(0).equalTo(0)
                    .flatMap(new MinimumIdFilter(updatedElements));

    List<Tuple2<Long, Long>> result = iteration.closeWith(updatedComponentId).collect();
    Collections.sort(result, new TestBaseUtils.TupleComparator<Tuple2<Long, Long>>());

    assertEquals(expectedResult, result);
}
Example 5: testAggregatorWithoutParameterForIterate
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
@Test
public void testAggregatorWithoutParameterForIterate() throws Exception {
    /*
     * Test aggregator without parameter for iterate
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);

    DataSet<Integer> initialSolutionSet = CollectionDataSets.getIntegerDataSet(env);
    IterativeDataSet<Integer> iteration = initialSolutionSet.iterate(MAX_ITERATIONS);

    // register aggregator
    LongSumAggregator aggr = new LongSumAggregator();
    iteration.registerAggregator(NEGATIVE_ELEMENTS_AGGR, aggr);

    // register convergence criterion
    iteration.registerAggregationConvergenceCriterion(NEGATIVE_ELEMENTS_AGGR, aggr,
            new NegativeElementsConvergenceCriterion());

    DataSet<Integer> updatedDs = iteration.map(new SubtractOneMap());
    iteration.closeWith(updatedDs).writeAsText(resultPath);
    env.execute();

    expected = "-3\n" + "-2\n" + "-2\n" + "-1\n" + "-1\n"
            + "-1\n" + "0\n" + "0\n" + "0\n" + "0\n"
            + "1\n" + "1\n" + "1\n" + "1\n" + "1\n";
}
Example 6: testMultipleIterations
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
@Test
public void testMultipleIterations() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(100);

    DataSet<String> input = env.readTextFile(IN_FILE).name("source1");

    DataSet<String> reduced = input
            .map(new IdentityMapper<String>())
            .reduceGroup(new Top1GroupReducer<String>());

    IterativeDataSet<String> iteration1 = input.iterate(100);
    IterativeDataSet<String> iteration2 = input.iterate(20);
    IterativeDataSet<String> iteration3 = input.iterate(17);

    iteration1.closeWith(iteration1.map(new IdentityMapper<String>()).withBroadcastSet(reduced, "bc1"))
            .output(new DiscardingOutputFormat<String>());
    iteration2.closeWith(iteration2.reduceGroup(new Top1GroupReducer<String>()).withBroadcastSet(reduced, "bc2"))
            .output(new DiscardingOutputFormat<String>());
    iteration3.closeWith(iteration3.reduceGroup(new IdentityGroupReducer<String>()).withBroadcastSet(reduced, "bc3"))
            .output(new DiscardingOutputFormat<String>());

    Plan plan = env.createProgramPlan();

    try {
        compileNoStats(plan);
    }
    catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Example 7: main
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
public static void main(String[] args) throws Exception {

    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    final int iterations = params.getInt("iterations", 10);

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    // get input x data from elements
    DataSet<Data> data;
    if (params.has("input")) {
        // read data from CSV file
        data = env.readCsvFile(params.get("input"))
                .fieldDelimiter(" ")
                .includeFields(true, true)
                .pojoType(Data.class);
    } else {
        System.out.println("Executing LinearRegression example with default input data set.");
        System.out.println("Use --input to specify file input.");
        data = LinearRegressionData.getDefaultDataDataSet(env);
    }

    // get the parameters from elements
    DataSet<Params> parameters = LinearRegressionData.getDefaultParamsDataSet(env);

    // set number of bulk iterations for SGD linear regression
    IterativeDataSet<Params> loop = parameters.iterate(iterations);

    DataSet<Params> newParameters = data
            // compute a single step using every sample
            .map(new SubUpdate()).withBroadcastSet(loop, "parameters")
            // sum up all the steps
            .reduce(new UpdateAccumulator())
            // average the steps and update all parameters
            .map(new Update());

    // feed new parameters back into next iteration
    DataSet<Params> result = loop.closeWith(newParameters);

    // emit result
    if (params.has("output")) {
        result.writeAsText(params.get("output"));
        // execute program
        env.execute("Linear Regression example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Example 8: main
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().setGlobalJobParameters(params); // make parameters available in the web interface

    double minSupport = params.getDouble("min-support", 0.5);
    int iterations = params.getInt("itemset-size", 4);
    if (!parametersCorrect(minSupport, iterations)) { return; }

    // load the data
    DataSet<Tuple2<Integer, Integer>> input = env.readCsvFile(params.getRequired("input"))
            .includeFields("11")
            .fieldDelimiter("\t")
            .lineDelimiter("\n")
            .types(Integer.class, Integer.class);

    // get the number of distinct transactions
    long numberOfTransactions = input
            .distinct(0)
            .count();

    // calculate the number of transactions sufficient for the support threshold
    long minNumberOfTransactions = (long) (numberOfTransactions * minSupport);

    DataSet<Tuple2<Integer, ArrayList<Integer>>> transactions = input
            .groupBy(0)
            .reduceGroup(new TransactionGroupReduceFunction());

    // compute frequent itemsets for itemset_size = 1
    DataSet<ItemSet> c1 = input
            // map item to 1
            .map(new InputMapFunction())
            // group by hashCode of the ItemSet
            .groupBy(new ItemSetKeySelector())
            // sum the number of transactions containing the ItemSet
            .reduce(new ItemSetReduceFunction())
            // remove ItemSets with frequency under the support threshold
            .filter(new ItemSetFrequencyFilterFunction(minNumberOfTransactions));

    // start of the loop
    // itemset_size = 2
    IterativeDataSet<ItemSet> initial = c1.iterate(iterations - 1);

    // create the candidate itemset for the next iteration
    DataSet<ItemSet> candidates = initial.cross(c1)
            .with(new ItemSetCrossFunction())
            .distinct(new ItemSetKeySelector());

    // calculate actual numberOfTransactions
    DataSet<ItemSet> selected = candidates
            .map(new ItemSetCalculateFrequency()).withBroadcastSet(transactions, "transactions")
            .filter(new ItemSetFrequencyFilterFunction(minNumberOfTransactions));

    // end of the loop
    // stop when we run out of iterations or candidates is empty
    DataSet<ItemSet> output = initial.closeWith(selected, selected);

    if (params.has("output")) {
        // write the final solution to file
        output.writeAsFormattedText(params.get("output"), new ItemSetTextFormatter());
        env.execute("Flink Apriori");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        output.print();
        System.out.println("Number of iterations: " + iterations);
        System.out.println("Number of transactions: " + numberOfTransactions);
        System.out.println("Minimal number of transactions for support threshold of "
                + minSupport + " = " + minNumberOfTransactions);
    }
}
Example 9: getTestPlanLeftStatic
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
private Plan getTestPlanLeftStatic(String strategy) {

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    @SuppressWarnings("unchecked")
    DataSet<Tuple3<Long, Long, Long>> bigInput = env.fromElements(new Tuple3<Long, Long, Long>(1L, 2L, 3L),
            new Tuple3<Long, Long, Long>(1L, 2L, 3L), new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Big");

    @SuppressWarnings("unchecked")
    DataSet<Tuple3<Long, Long, Long>> smallInput = env.fromElements(new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Small");

    IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10);

    Configuration joinStrategy = new Configuration();
    joinStrategy.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);

    DataSet<Tuple3<Long, Long, Long>> inner = smallInput.join(iteration).where(0).equalTo(0).with(new DummyJoiner()).name("DummyJoiner").withParameters(joinStrategy);

    DataSet<Tuple3<Long, Long, Long>> output = iteration.closeWith(inner);

    output.output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

    return env.createProgramPlan();
}
Example 10: testProgram
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Integer> data = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);

    IterativeDataSet<Integer> iteration = data.iterate(10);

    DataSet<Integer> result = data.reduceGroup(new PickOneAllReduce()).withBroadcastSet(iteration, "bc");

    final List<Integer> resultList = new ArrayList<Integer>();
    iteration.closeWith(result).output(new LocalCollectionOutputFormat<Integer>(resultList));

    env.execute();

    Assert.assertEquals(8, resultList.get(0).intValue());
}
Example 11: testParameterizableAggregator
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
@Test
public void testParameterizableAggregator() throws Exception {

    final int maxIterations = 5;
    final String aggregatorName = "elements.in.component.aggregator";
    final long componentId = 1L;

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<Long, Long>> initialSolutionSet = env.fromCollection(verticesInput);
    DataSet<Tuple2<Long, Long>> edges = env.fromCollection(edgesInput);

    IterativeDataSet<Tuple2<Long, Long>> iteration =
            initialSolutionSet.iterate(maxIterations);

    // register the aggregator
    iteration.registerAggregator(aggregatorName, new LongSumAggregatorWithParameter(componentId));

    DataSet<Tuple2<Long, Long>> verticesWithNewComponents = iteration.join(edges).where(0).equalTo(0)
            .with(new NeighborWithComponentIDJoin())
            .groupBy(0).min(1);

    DataSet<Tuple2<Long, Long>> updatedComponentId =
            verticesWithNewComponents.join(iteration).where(0).equalTo(0)
                    .flatMap(new MinimumIdFilterCounting(aggregatorName));

    List<Tuple2<Long, Long>> result = iteration.closeWith(updatedComponentId).collect();
    Collections.sort(result, new TestBaseUtils.TupleComparator<Tuple2<Long, Long>>());

    List<Tuple2<Long, Long>> expectedResult = Arrays.asList(
            new Tuple2<>(1L, 1L),
            new Tuple2<>(2L, 1L),
            new Tuple2<>(3L, 1L),
            new Tuple2<>(4L, 1L),
            new Tuple2<>(5L, 1L),
            new Tuple2<>(6L, 1L),
            new Tuple2<>(7L, 7L),
            new Tuple2<>(8L, 7L),
            new Tuple2<>(9L, 7L)
    );

    // check program result
    assertEquals(expectedResult, result);

    // check aggregators
    long[] aggrValues = MinimumIdFilterCounting.aggr_value;

    // note that position 0 has the end result from superstep 1, retrieved at the start of iteration 2;
    // position 1 has the result from superstep 2, retrieved at the start of iteration 3.
    // the result from iteration 5 is not available, because no iteration 6 happens
    assertEquals(3, aggrValues[0]);
    assertEquals(4, aggrValues[1]);
    assertEquals(5, aggrValues[2]);
    assertEquals(6, aggrValues[3]);
}
Example 12: main
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
public static void main(String[] args) throws Exception {

    ParameterTool params = ParameterTool.fromArgs(args);

    final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
    final int maxIterations = params.getInt("iterations", 10);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make the parameters available to the web ui
    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Long> pagesInput = getPagesDataSet(env, params);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput
            .map(new RankAssigner((1.0d / numPages)));

    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
            linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            // join pages with outgoing edges and distribute rank
            .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
            // collect and sum ranks
            .groupBy(0).aggregate(SUM, 1)
            // apply dampening factor
            .map(new Dampener(DAMPENING_FACTOR, numPages));

    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
            newRanks,
            newRanks.join(iteration).where(0).equalTo(0)
                    // termination condition
                    .filter(new EpsilonFilter()));

    // emit result
    if (params.has("output")) {
        finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Basic Page Rank Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        finalPageRanks.print();
    }
}
Example 13: main
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        throw new IllegalArgumentException("Missing parameters");
    }

    final String pointsData = args[0];
    final String centersData = args[1];
    final int numIterations = Integer.parseInt(args[2]);

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().disableSysoutLogging();

    // get input data
    DataSet<Point> points = env.fromElements(pointsData.split("\n"))
            .map(new TuplePointConverter());

    DataSet<Centroid> centroids = env.fromElements(centersData.split("\n"))
            .map(new TupleCentroidConverter());

    // set number of bulk iterations for KMeans algorithm
    IterativeDataSet<Centroid> loop = centroids.iterate(numIterations);

    DataSet<Centroid> newCentroids = points
            // compute closest centroid for each point
            .map(new SelectNearestCenter()).withBroadcastSet(loop, "centroids")
            // count and sum point coordinates for each centroid (test pojo return type)
            .map(new CountAppender())
            // !test if key expressions are working!
            .groupBy("field0").reduce(new CentroidAccumulator())
            // compute new centroids from point counts and coordinate sums
            .map(new CentroidAverager());

    // feed new centroids back into next iteration
    DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);

    // test that custom data type collects are working
    finalCentroids.collect();
}
Example 14: testProgram
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<Integer> data = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);

    IterativeDataSet<Integer> iteration = data.iterate(10);

    DataSet<Integer> result = data.reduceGroup(new PickOneAllReduce()).withBroadcastSet(iteration, "bc");

    final List<Integer> resultList = new ArrayList<Integer>();
    iteration.closeWith(result).output(new LocalCollectionOutputFormat<Integer>(resultList));

    env.execute();

    Assert.assertEquals(8, resultList.get(0).intValue());
}
Example 15: main
import org.apache.flink.api.java.DataSet; // import the package/class the method depends on
public static void main(String[] args) throws Exception {

    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().setGlobalJobParameters(params); // make parameters available in the web interface

    // get input data:
    // read the points and centroids from the provided paths or fall back to default data
    DataSet<Point> points = getPointDataSet(params, env);
    DataSet<Centroid> centroids = getCentroidDataSet(params, env);

    // set number of bulk iterations for KMeans algorithm
    IterativeDataSet<Centroid> loop = centroids.iterate(params.getInt("iterations", 10));

    DataSet<Centroid> newCentroids = points
            // compute closest centroid for each point
            .map(new SelectNearestCenter()).withBroadcastSet(loop, "centroids")
            // count and sum point coordinates for each centroid
            .map(new CountAppender())
            .groupBy(0).reduce(new CentroidAccumulator())
            // compute new centroids from point counts and coordinate sums
            .map(new CentroidAverager());

    // feed new centroids back into next iteration
    DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);

    DataSet<Tuple2<Integer, Point>> clusteredPoints = points
            // assign points to final clusters
            .map(new SelectNearestCenter()).withBroadcastSet(finalCentroids, "centroids");

    // emit result
    if (params.has("output")) {
        clusteredPoints.writeAsCsv(params.get("output"), "\n", " ");
        // since file sinks are lazy, we trigger the execution explicitly
        env.execute("KMeans Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        clusteredPoints.print();
    }
}