本文整理汇总了Java中org.kitesdk.data.crunch.CrunchDatasets类的典型用法代码示例。如果您正苦于以下问题:Java CrunchDatasets类的具体用法?Java CrunchDatasets怎么用?Java CrunchDatasets使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
CrunchDatasets类属于org.kitesdk.data.crunch包,在下文中一共展示了CrunchDatasets类的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: run
import org.kitesdk.data.crunch.CrunchDatasets; //导入依赖的package包/类
@Override
public int run(String[] args) throws Exception {
final long todayStart = startOfDay();
// Destination: the persistent logs dataset.
Dataset<Record> persistentLogs = Datasets.load(
    "dataset:file:/tmp/data/logs", Record.class);
// Source: anything staged with a timestamp before the start of today.
Dataset<Record> stagingLogs = Datasets.load(
    "dataset:file:/tmp/data/logs_staging", Record.class);
View<Record> readyToMove = stagingLogs.toBefore("timestamp", todayStart);
// Wire the ready view into a Crunch pipeline and append it to the destination.
ReadableSource<Record> stagedSource = CrunchDatasets.asSource(readyToMove);
PCollection<Record> stagedRecords = read(stagedSource);
getPipeline().write(stagedRecords,
    CrunchDatasets.asTarget(persistentLogs), Target.WriteMode.APPEND);
PipelineResult pipelineResult = run();
if (!pipelineResult.succeeded()) {
  return 1;
}
// Only drop the staged partition once the copy has definitely succeeded.
readyToMove.deleteAll();
return 0;
}
示例2: run
import org.kitesdk.data.crunch.CrunchDatasets; //导入依赖的package包/类
public void run() {
// TODO: Switch to parameterized views.
View<ExampleEvent> source = Datasets.load(ScheduledReportApp.EXAMPLE_DS_URI,
    ExampleEvent.class);
RefinableView<GenericRecord> reportDataset = Datasets.load(ScheduledReportApp.REPORT_DS_URI,
    GenericRecord.class);
// Narrow the report dataset to the partition for this run's nominal time (UTC).
DateTime nominal = getNominalTime().toDateTime(DateTimeZone.UTC);
View<GenericRecord> reportView = reportDataset
    .with("year", nominal.getYear())
    .with("month", nominal.getMonthOfYear())
    .with("day", nominal.getDayOfMonth())
    .with("hour", nominal.getHourOfDay())
    .with("minute", nominal.getMinuteOfHour());
Pipeline pipeline = getPipeline();
PCollection<ExampleEvent> events = pipeline.read(CrunchDatasets.asSource(source));
// Count events per user ID, then emit one generic report record per user.
PTable<Long, Long> countsByUser = events
    .by(new GetEventId(), Avros.longs())
    .keys()
    .count();
PCollection<GenericData.Record> report = countsByUser.parallelDo(
    new ToUserReport(),
    Avros.generics(SCHEMA));
pipeline.write(report, CrunchDatasets.asTarget(reportView));
pipeline.run();
}
示例3: run
import org.kitesdk.data.crunch.CrunchDatasets; //导入依赖的package包/类
@Override
public int run(String[] args) throws Exception {
// Turn debug on while in development.
getPipeline().enableDebug();
getPipeline().getConfiguration().set("crunch.log.job.progress", "true");
Dataset<StandardEvent> eventsDataset = Datasets.load(
    "dataset:hdfs:/tmp/data/default/events", StandardEvent.class);
// Resolve which slice of the events dataset to sessionize:
// no args / "LATEST" -> everything before the current minute,
// a view URI -> that view, otherwise -> the partition at the given path.
View<StandardEvent> input;
if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
  // Truncate "now" to the current minute in UTC.
  Calendar utcNow = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
  utcNow.set(Calendar.SECOND, 0);
  utcNow.set(Calendar.MILLISECOND, 0);
  // restrict events to before the current minute
  // in the workflow, this also has a lower bound for the timestamp
  input = eventsDataset.toBefore("timestamp", utcNow.getTimeInMillis());
} else if (isView(args[0])) {
  input = Datasets.load(args[0], StandardEvent.class);
} else {
  input = FileSystemDatasets.viewForPath(eventsDataset, new Path(args[0]));
}
if (input.isEmpty()) {
  LOG.info("No records to process.");
  return 0;
}
// Group events by session key, then build one Session per group.
PCollection<StandardEvent> events = read(CrunchDatasets.asSource(input));
PCollection<Session> sessions = events
    .by(new GetSessionKey(), Avros.strings())
    .groupByKey()
    .parallelDo(new MakeSession(), Avros.specifics(Session.class));
// Append the sessions to the "sessions" dataset.
getPipeline().write(sessions,
    CrunchDatasets.asTarget("dataset:hive:/tmp/data/default/sessions"),
    Target.WriteMode.APPEND);
return run().succeeded() ? 0 : 1;
}
示例4: run
import org.kitesdk.data.crunch.CrunchDatasets; //导入依赖的package包/类
public void run(@DataIn(name="example_events", type=ExampleEvent.class) View<ExampleEvent> input,
    @DataOut(name="odd_users", type=ExampleEvent.class) View<ExampleEvent> output) {
// Read the input view, keep only the events accepted by KeepOddUsers,
// and write the surviving events to the output view.
Pipeline pipeline = getPipeline();
PCollection<ExampleEvent> allEvents = pipeline.read(CrunchDatasets.asSource(input));
PCollection<ExampleEvent> oddUserEvents = allEvents.filter(new KeepOddUsers());
pipeline.write(oddUserEvents, CrunchDatasets.asTarget(output));
pipeline.run();
}