This article collects typical usage examples of the Java method org.apache.flink.api.java.operators.DataSource.flatMap. If you are wondering what DataSource.flatMap does, how to use it, or where to find concrete examples, the curated code samples below should help. You can also explore further usages of the enclosing class, org.apache.flink.api.java.operators.DataSource.
The following shows 15 code examples of the DataSource.flatMap method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
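Before diving into the project-specific examples, here is a minimal, self-contained sketch of what DataSource.flatMap does: it applies a FlatMapFunction<IN, OUT> to every record of the source, and each record may emit zero or more output records through a Collector. The input path and tokenizing logic below are illustrative placeholders, not taken from the examples that follow.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.util.Collector;

public class FlatMapSketch {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Hypothetical input file; replace with a real path.
        DataSource<String> lines = env.readTextFile("file:///tmp/input.txt");
        // Each input line may emit zero or more tokens.
        DataSet<String> tokens = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) {
                for (String token : line.split("\\s+")) {
                    if (!token.isEmpty()) {
                        out.collect(token);
                    }
                }
            }
        });
        // print() collects and prints the result of the plan.
        tokens.print();
    }
}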
Example 1: start
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void start( MachineLearningDefinienListConfig config ){
    LOG.info("Start machine learning approach for listing identifier-definien pairs");
    // first, create a flink environment
    ExecutionEnvironment flinkEnv = ExecutionEnvironment.getExecutionEnvironment();
    flinkEnv.setParallelism( config.getParallelism() );
    LOG.debug("Read wikidump via flink");
    DataSource<String> dataSource = FlinkMlpRelationFinder.readWikiDump( config, flinkEnv );
    LOG.debug("Parse documents via flink");
    FlatMapOperator<String, RawWikiDocument> mapOperator = dataSource.flatMap(new TextExtractorMapper());
    LOG.debug("Open text annotator mapper");
    TextAnnotatorMapper annotatorMapper = new TextAnnotatorMapper(config);
    // The ML approach does not create a PosTagger at this point, so open the annotator mapper explicitly here.
    annotatorMapper.open(null);
    DataSet<ParsedWikiDocument> parsedDocuments = mapOperator.map( annotatorMapper );
    LOG.debug("Create feature extractor without Gouldi");
    CreateCandidatesMapper candidatesMapper = new CreateCandidatesMapper(config);
    DataSet<WikiDocumentOutput> outputDataSet = parsedDocuments.map( candidatesMapper );
    LOG.debug("Map to output format.");
    RelationMapper outputMapper = new RelationMapper();
    DataSet<LinkedList<String[]>> outputs = outputDataSet.map(outputMapper);
    Path outputPath = Paths.get(config.getOutputDir(), OUTPUT_FILE_NAME);
    LOG.info("Write output file " + outputPath.toString());
    outputs.writeAsFormattedText(
            outputPath.toString(),
            FileSystem.WriteMode.OVERWRITE,
            new OutputFormatter()
    ).setParallelism(1);
    try {
        flinkEnv.execute();
    } catch (Exception e) {
        LOG.error("Error during execution of flink process.", e);
    }
}
Example 2: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> inputNodesAndValue = env.readTextFile(argPathToNodesAndValues);
    DataSource<String> inputIndex = env.readTextFile(argPathToIndex);
    DataSet<Tuple2<String, Long>> nodes = inputIndex.flatMap(new NodeReader());
    /* Convert the input to (node, value) pairs */
    DataSet<Tuple2<Long, Double>> nodesAndValue = inputNodesAndValue.flatMap(new ValueReader());
    // Emit (1, ID, value) so that all records share one group-by key
    DataSet<Tuple3<Long, Long, Double>> topKMapper = nodesAndValue.flatMap(new TopKMapper());
    // Get the top k entries
    DataSet<Tuple3<Long, Long, Double>> topKReducer = topKMapper.groupBy(0)
            .sortGroup(2, Order.DESCENDING).first(topK);
    // Join node IDs with node names
    DataSet<Tuple2<String, Double>> topKwithName = topKReducer.join(nodes)
            .where(1).equalTo(1).flatMap(new ProjectNodeWithName());
    topKwithName.writeAsCsv(argPathOut, WriteMode.OVERWRITE);
    env.execute();
}
Example 3: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> inputArc = env.readTextFile(argPathToArc);
    /* Convert the input to arcs, consisting of (source, target) */
    DataSet<Tuple2<Long, Long>> arcs = inputArc.flatMap(new ArcReader());
    DataSet<Tuple3<Long, Long, Double>> srcIncMat = arcs.map(new SourceIncMatrix()).name("S(G)");
    srcIncMat.writeAsCsv(argPathOut, "\n", "\t", FileSystem.WriteMode.OVERWRITE);
    env.execute();
}
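Several of these examples pass user-defined readers such as ArcReader, EdgeReader, and NodeReader to flatMap without showing their implementations. Purely as an assumption about their shape (the actual classes in the source projects may use a different delimiter or extra filtering), a reader that turns a tab-separated "source target" line into a (source, target) tuple could look like this:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Hypothetical sketch of an arc/edge reader; not the original ArcReader.
public class ArcReader implements FlatMapFunction<String, Tuple2<Long, Long>> {
    @Override
    public void flatMap(String line, Collector<Tuple2<Long, Long>> out) {
        // skip blank lines and comment lines
        if (line.isEmpty() || line.startsWith("#")) {
            return;
        }
        String[] fields = line.split("\t");
        if (fields.length == 2) {
            out.collect(new Tuple2<>(Long.parseLong(fields[0]), Long.parseLong(fields[1])));
        }
    }
}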
Example 4: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> input = env.readTextFile(argPathToArc);
    /* Convert the input to edges, consisting of (source, target) */
    DataSet<Tuple2<Long, Long>> edges = input.flatMap(new EdgeReader());
    /* Create a dataset of all vertex ids and count them */
    DataSet<Long> numVertices = edges.<Tuple1<Long>>project(0)
            .union(edges.<Tuple1<Long>>project(1)).distinct()
            .reduceGroup(new CountVertices());
    /* Compute the out-degree of every vertex: the out-degree and in-degree versions
       differ only in the projection; project(0) keeps the source, so group by source */
    DataSet<Tuple2<Long, Long>> verticesWithDegree = edges.<Tuple1<Long>>project(0)
            .groupBy(0).reduceGroup(new DegreeOfVertex());
    /* Compute the degree distribution */
    DataSet<Tuple2<Long, Double>> degreeDistribution = verticesWithDegree
            .groupBy(1).reduceGroup(new DistributionElement())
            .withBroadcastSet(numVertices, "numVertices");
    degreeDistribution.writeAsCsv(argPathOut, FileSystem.WriteMode.OVERWRITE);
    env.execute();
}
Example 5: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> input = env.readTextFile(argPathToArc);
    /* Convert the input to edges, consisting of (source, target) */
    DataSet<Tuple2<Long, Long>> edges = input.flatMap(new EdgeReader());
    /* Create a dataset of all vertex ids and count them */
    DataSet<Long> numVertices = edges.<Tuple1<Long>>project(0)
            .union(edges.<Tuple1<Long>>project(1)).distinct()
            .reduceGroup(new CountVertices());
    /* Compute the in-degree of every vertex: unlike the out-degree version,
       project(1) keeps the target, then group by it */
    DataSet<Tuple2<Long, Long>> verticesWithDegree = edges.<Tuple1<Long>>project(1)
            .groupBy(0).reduceGroup(new DegreeOfVertex());
    /* Compute the degree distribution */
    DataSet<Tuple2<Long, Double>> degreeDistribution = verticesWithDegree
            .groupBy(1).reduceGroup(new DistributionElement())
            .withBroadcastSet(numVertices, "numVertices");
    degreeDistribution.writeAsCsv(argPathOut, FileSystem.WriteMode.OVERWRITE);
    env.execute();
}
Example 6: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> input = env.readTextFile(Config.pathToSlashdotZoo());
    /* Convert the input to edges, consisting of (source, target, isFriend) */
    DataSet<Tuple3<Long, Long, Boolean>> edges = input.flatMap(new EdgeReader());
    /* Create a dataset of all vertex ids and count them */
    DataSet<Long> numVertices =
            edges.project(0).types(Long.class)
                    .union(edges.project(1).types(Long.class))
                    .distinct().reduceGroup(new CountVertices());
    /* Compute the degree of every vertex */
    DataSet<Tuple2<Long, Long>> verticesWithDegree =
            edges.project(0).types(Long.class)
                    .groupBy(0).reduceGroup(new DegreeOfVertex());
    /* Compute the degree distribution */
    DataSet<Tuple2<Long, Double>> degreeDistribution =
            verticesWithDegree.groupBy(1).reduceGroup(new DistributionElement())
                    .withBroadcastSet(numVertices, "numVertices");
    degreeDistribution.writeAsText(Config.outputPath(), FileSystem.WriteMode.OVERWRITE);
    env.execute();
}
Example 7: extractRedirectMappings
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static DataSet<RedirectMapping> extractRedirectMappings(ExecutionEnvironment env, DataSource<String> wikiDump) {
    return wikiDump.flatMap(new FlatMapFunction<String, RedirectMapping>() {
        @Override
        public void flatMap(String content, Collector<RedirectMapping> out) throws Exception {
            Pattern pattern = Pattern.compile(REGEX, Pattern.DOTALL);
            Matcher m = pattern.matcher(content);
            // if the record does not contain parsable page-xml, skip it
            if (!m.find()) return;
            // otherwise create a WikiDocument object from the xml
            WikiDocument doc = new WikiDocument();
            doc.setId(Integer.parseInt(m.group(3)));
            doc.setTitle(WikiSimStringUtils.unescapeEntities(m.group(1)));
            doc.setNS(Integer.parseInt(m.group(2)));
            if (doc.getNS() != 0) return;
            Pattern redirect = Pattern.compile("<redirect title=\"(.+?)\"", Pattern.CASE_INSENSITIVE);
            Matcher mr = redirect.matcher(content);
            if (!mr.find()) return;
            out.collect(new RedirectMapping(
                    doc.getTitle(),
                    WikiSimStringUtils.unescapeEntities(mr.group(1))
            ));
        }
    });
}
Example 8: extractIdTitleMapping
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static DataSet<IdTitleMapping> extractIdTitleMapping(ExecutionEnvironment env, DataSource<String> wikiDump) {
    return wikiDump.flatMap(new FlatMapFunction<String, IdTitleMapping>() {
        @Override
        public void flatMap(String s, Collector<IdTitleMapping> out) throws Exception {
            DocumentProcessor dp = new DocumentProcessor();
            WikiDocument doc = dp.processDoc(s);
            if (doc != null) {
                out.collect(new IdTitleMapping(doc.getId(), doc.getTitle()));
            }
        }
    });
}
Example 9: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> inputArc = env.readTextFile(argPathToArc);
    /* Convert the input to arcs, consisting of (source, target) */
    DataSet<Tuple2<Long, Long>> arcs = inputArc.flatMap(new ArcReader());
    DataSet<Tuple3<Long, Long, Double>> tarIncMat = arcs.map(new TargetIncMatrix()).name("T(G)");
    tarIncMat.writeAsCsv(argPathOut, "\n", "\t", FileSystem.WriteMode.OVERWRITE);
    env.execute();
}
Example 10: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String... args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    // set up the execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> inputArc = env.readTextFile(edgesPath);
    DataSource<String> inputIndex = env.readTextFile(verticesPath);
    DataSet<Long> vertices = inputIndex.flatMap(new NodeReader());
    /* Convert the input to edges, consisting of (source, target) */
    DataSet<Tuple2<Long, Long>> arcs = inputArc.flatMap(new ArcReader());
    // Undirected graph (each arc becomes an edge in both directions)
    DataSet<Tuple2<Long, Long>> edges = arcs.flatMap(new UndirectEdge()).distinct();
    // assign the initial component IDs (equal to the vertex id)
    DataSet<Tuple2<Long, Long>> verticesWithInitialId = vertices.map(new DuplicateValue<Long>());
    // Open a delta iteration
    DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration = verticesWithInitialId
            .iterateDelta(verticesWithInitialId, maxIterations, 0);
    // Apply the step logic: join with the edges, select the minimum
    // neighbor, update if the component of the candidate is smaller
    DataSet<Tuple2<Long, Long>> changes = iteration.getWorkset()
            .join(edges).where(0).equalTo(0)
            .with(new NeighborWithComponentIDJoin()).groupBy(0)
            .aggregate(Aggregations.MIN, 1)
            .join(iteration.getSolutionSet()).where(0).equalTo(0)
            .with(new ComponentIdFilter());
    // close the delta iteration (delta and new workset are identical)
    DataSet<Tuple2<Long, Long>> vertexWithComponentID = iteration.closeWith(changes, changes);
    // Number of components
    DataSet<Long> numComponent = vertexWithComponentID.<Tuple1<Long>>project(1).distinct()
            .reduceGroup(new CountComponent());
    /* Compute the size of every component, emit (component size, 1) */
    DataSet<Tuple2<Long, Long>> componentCount = vertexWithComponentID
            .<Tuple1<Long>>project(1).groupBy(0)
            .reduceGroup(new ComponentCount()).flatMap(new ComponentMap());
    DataSet<Tuple2<Long, Long>> componentDistribution = componentCount
            .groupBy(0).aggregate(Aggregations.SUM, 1);
    // Emit result
    if (fileOutput) {
        componentDistribution.writeAsCsv(outputPath, "\n", " ", FileSystem.WriteMode.OVERWRITE);
        //numComponent.print();
    } else {
        numComponent.print();
        componentDistribution.print();
    }
    env.execute("Weakly Connected Components");
}
Example 11: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> inputArc = env.readTextFile(argPathToArc);
    DataSource<String> inputIndex = env.readTextFile(argPathToIndex);
    DataSet<Tuple2<String, Long>> nodes = inputIndex.flatMap(new NodeReader());
    /* Convert the input to edges, consisting of (source, target) */
    DataSet<Tuple2<Long, Long>> arcs = inputArc.flatMap(new ArcReader());
    /* Compute the out-degree of every vertex */
    DataSet<Tuple2<Long, Long>> verticesWithDegree = arcs
            .<Tuple1<Long>>project(0).groupBy(0)
            .reduceGroup(new DegreeOfVertex());
    // Keep only the nodes whose degree is higher than the average degree
    DataSet<Tuple2<Long, Long>> highOutDegree = verticesWithDegree.filter(new DegreeFilter());
    // Emit (1, ID, degree) so that all records share one group-by key
    DataSet<Tuple3<Long, Long, Long>> topKMapper = highOutDegree.flatMap(new TopKMapper());
    // Get the top k entries
    DataSet<Tuple3<Long, Long, Long>> topKReducer = topKMapper.groupBy(0)
            .sortGroup(2, Order.DESCENDING).first(topK);
    // Join node IDs with node names
    DataSet<Tuple2<String, Long>> topKwithName = topKReducer.join(nodes)
            .where(1).equalTo(1).flatMap(new ProjectNodeWithName());
    topKwithName.writeAsCsv(argPathOut, WriteMode.OVERWRITE);
    env.execute();
}
Example 12: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> inputArc = env.readTextFile(argPathToArc);
    DataSource<String> inputIndex = env.readTextFile(argPathToIndex);
    DataSet<Tuple2<String, Long>> nodes = inputIndex.flatMap(new NodeReader());
    /* Convert the input to edges, consisting of (source, target) */
    DataSet<Tuple2<Long, Long>> arcs = inputArc.flatMap(new ArcReader());
    /* Compute the in-degree of every vertex */
    DataSet<Tuple2<Long, Long>> verticesWithDegree = arcs
            .<Tuple1<Long>>project(1).groupBy(0)
            .reduceGroup(new DegreeOfVertex());
    // Keep only the nodes whose degree is higher than a certain threshold
    DataSet<Tuple2<Long, Long>> highOutDegree = verticesWithDegree.filter(new DegreeFilter());
    // Emit (1, ID, degree) so that all records share one group-by key
    DataSet<Tuple3<Long, Long, Long>> topKMapper = highOutDegree.flatMap(new TopKMapper());
    // Get the top k entries
    DataSet<Tuple3<Long, Long, Long>> topKReducer = topKMapper.groupBy(0)
            .sortGroup(2, Order.DESCENDING).first(topK);
    // Join node IDs with node names
    DataSet<Tuple2<String, Long>> topKwithName = topKReducer.join(nodes)
            .where(1).equalTo(1).flatMap(new ProjectNodeWithName());
    topKwithName.writeAsCsv(argPathOut, WriteMode.OVERWRITE);
    env.execute();
}
Example 13: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // Read the input files - pages and links
    DataSource<String> inputPages = env.readTextFile(argPathToIndex);
    DataSet<Tuple1<Long>> pages = inputPages.flatMap(new PageReader());
    DataSource<String> inputLinks = env.readTextFile(argPathToArc);
    DataSet<Tuple2<Long, Long>> links = inputLinks.flatMap(new LinkReader());
    // Get the total count of pages
    DataSet<Long> numPages = pages.reduceGroup(new CountPages());
    // Find sinks (pages without outgoing links)
    DataSet<Tuple1<Long>> noOutgoingLinks = pages.flatMap(new FindSinks())
            .withBroadcastSet(links.<Tuple1<Long>>project(0).distinct(), "pages");
    // Point sinks to all other nodes
    DataSet<Tuple2<Long, Long>> sinksToAll = noOutgoingLinks.flatMap(new PointToAllOther())
            .withBroadcastSet(pages, "pages");
    // Assign the initial rank to every page - 1 / numPages
    DataSet<Tuple2<Long, Double>> pagesRanked = pages.map(new InitialRanking())
            .withBroadcastSet(numPages, "numPages");
    // Encode the sparse adjacency matrix as an adjacency list
    DataSet<Tuple2<Long, Long[]>> sparseMatrix = links.union(sinksToAll)
            .groupBy(0).reduceGroup(new BuildList());
    // Start iteration - not using a DeltaIteration since the whole DataSet is recomputed
    IterativeDataSet<Tuple2<Long, Double>> iterationSet = pagesRanked.iterate(maxIterations);
    DataSet<Tuple2<Long, Double>> pageRank = iterationSet
            // Iteratively join the iterationSet with the sparseMatrix
            .join(sparseMatrix).where(0).equalTo(0)
            .flatMap(new DistributePageRank())
            .groupBy(0)
            .sum(1)
            // To implement the random teleport behaviour, recompute the PageRank by
            // applying beta * pageRank + ((1 - beta) / numPages) to each value
            .map(new RandomTeleport())
            .withBroadcastSet(numPages, "numPages");
    DataSet<Tuple2<Long, Double>> results = iterationSet.closeWith(
            pageRank, pageRank.join(iterationSet).where(0).equalTo(0)
                    .filter(new ConvergenceCondition()));
    results.writeAsCsv(argPathOut, WriteMode.OVERWRITE);
    env.execute();
}
Example 14: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // Read the input files - pages and links
    DataSource<String> inputPages = env.readTextFile(argPathToIndex);
    DataSet<Tuple1<Long>> pages = inputPages.flatMap(new PageReader());
    DataSource<String> inputLinks = env.readTextFile(argPathToArc);
    DataSet<Tuple2<Long, Long>> links = inputLinks.flatMap(new LinkReader());
    // Get the total count of pages
    DataSet<Long> numPages = pages.reduceGroup(new CountPages());
    // Find sinks (pages without outgoing links)
    DataSet<Tuple1<Long>> noOutgoingLinks = pages.flatMap(new FindSinks())
            .withBroadcastSet(links.<Tuple1<Long>>project(0).distinct(), "pages");
    // Point sinks to all other nodes
    DataSet<Tuple2<Long, Long>> sinksToAll = noOutgoingLinks.flatMap(new PointToAllOther())
            .withBroadcastSet(pages, "pages");
    // Assign the initial rank to every page - 1 / numPages
    DataSet<Tuple2<Long, Double>> pagesRanked = pages.map(new InitialRanking())
            .withBroadcastSet(numPages, "numPages");
    // Encode the sparse adjacency matrix as an adjacency list
    DataSet<Tuple2<Long, Long[]>> sparseMatrix = links.union(sinksToAll)
            .groupBy(0).reduceGroup(new BuildList());
    // Start iteration - not using a DeltaIteration since the whole DataSet is recomputed
    IterativeDataSet<Tuple2<Long, Double>> iterationSet = pagesRanked.iterate(maxIterations);
    DataSet<Tuple2<Long, Double>> pageRank = iterationSet
            // Iteratively join the iterationSet with the sparseMatrix
            .join(sparseMatrix).where(0).equalTo(0)
            .flatMap(new DistributePageRank())
            .groupBy(0)
            .sum(1)
            // To implement the random teleport behaviour, recompute the PageRank by
            // applying beta * pageRank + ((1 - beta) / numPages) to each value
            .map(new RandomTeleport())
            .withBroadcastSet(numPages, "numPages");
    DataSet<Tuple2<Long, Double>> resultsPageRank = iterationSet.closeWith(
            pageRank, pageRank.join(iterationSet).where(0).equalTo(0)
                    .filter(new ConvergenceCondition()));
    DataSet<Tuple2<Long, Double>> filterPageRank = resultsPageRank.filter(new TopKFilter());
    // Emit (1, node, PageRank)
    DataSet<Tuple3<Long, Long, Double>> mapPageRank = filterPageRank.flatMap(new TopKMapper());
    DataSet<Tuple2<Long, Double>> results = mapPageRank.groupBy(0)
            .sortGroup(2, Order.DESCENDING).first(topK)
            .<Tuple2<Long, Double>>project(1, 2);
    results.writeAsCsv(argPathOut, WriteMode.OVERWRITE);
    env.execute();
}
Example 15: main
import org.apache.flink.api.java.operators.DataSource; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSource<String> input = env.readTextFile(Config.pathToTrainingSet());
    // read the input with the df-cut applied
    DataSet<Tuple3<String, String, Long>> labeledTerms = input.flatMap(new DataReader());
    // conditional counter per word per label
    DataSet<Tuple3<String, String, Long>> termCounts = null; // IMPLEMENT ME (exercise placeholder; one possible solution is sketched below)
    termCounts.writeAsCsv(Config.pathToConditionals(), "\n", "\t", FileSystem.WriteMode.OVERWRITE);
    // word counts per label
    DataSet<Tuple2<String, Long>> termLabelCounts = null; // IMPLEMENT ME (exercise placeholder; one possible solution is sketched below)
    termLabelCounts.writeAsCsv(Config.pathToSums(), "\n", "\t", FileSystem.WriteMode.OVERWRITE);
    env.execute();
}
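The two IMPLEMENT ME lines above are exercise stubs left open in the original source (as written, the nulls would throw a NullPointerException at plan construction). Assuming DataReader emits (label, term, count) tuples, one possible, unofficial way to fill them in is:

// Assumed completion of the exercise stubs; the (label, term, count) field
// order is an assumption about what DataReader emits, not the author's solution.
DataSet<Tuple3<String, String, Long>> termCounts =
        labeledTerms.groupBy(0, 1).sum(2);             // occurrences per (label, term)

DataSet<Tuple2<String, Long>> termLabelCounts =
        termCounts.<Tuple2<String, Long>>project(0, 2) // keep (label, count)
                .groupBy(0).sum(1);                    // total occurrences per label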