本文整理匯總了Java中org.apache.crunch.types.PTypeFamily類的典型用法代碼示例。如果您正苦於以下問題：Java PTypeFamily類的具體用法？Java PTypeFamily怎麼用？Java PTypeFamily使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
PTypeFamily類屬於org.apache.crunch.types包，在下文中一共展示了PTypeFamily類的9個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: lloydsAlgorithm
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Refines a set of cluster centers with Lloyd's algorithm and hands back the centers produced by
 * the last refinement pass.
 *
 * @param points The data points to cluster
 * @param centers The list of initial centers; returned unchanged when no pass is executed
 * @param numIterations The number of passes to run, with each pass corresponding to a MapReduce job
 * @param approx Whether to use random projection for assigning points to centers
 * @param <V> The vector type of the points being clustered
 * @return The centers obtained after the final pass
 */
public <V extends RealVector> List<Centers> lloydsAlgorithm(PCollection<V> points, List<Centers> centers,
    int numIterations, boolean approx) {
  PTypeFamily family = points.getTypeFamily();
  // Table type emitted by the map step: an (int, int) key with a (vector, count) value.
  // NOTE(review): the meaning of the two ints in the key is defined by LloydsMapFn -- confirm there.
  PTableType<Pair<Integer, Integer>, Pair<V, Long>> tableType = family.tableOf(
      family.pairs(family.ints(), family.ints()),
      family.pairs(points.getPType(), family.longs()));
  Aggregator<Pair<V, Long>> vectorSummer = new SumVectorsAggregator<V>();

  List<Centers> current = centers;
  int pass = 0;
  while (pass < numIterations) {
    // Each pass builds a fresh index over the centers produced by the previous pass.
    KSketchIndex sketch = new KSketchIndex(current, projectionBits, projectionSamples, seed);
    LloydsMapFn<V> assignFn = new LloydsMapFn<V>(sketch, approx);
    LloydsCenters<V> refined = new LloydsCenters<V>(
        points.parallelDo("lloyds-" + pass, assignFn, tableType)
            .groupByKey()
            .combineValues(vectorSummer),
        current.size());
    current = refined.getValue();
    pass++;
  }
  return current;
}
示例2: apply
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Pairs every element of the collection with an integer obtained from
 * {@code nextInt(numFolds)} on a generator seeded with this object's {@code seed}.
 *
 * @param pcollect The collection whose elements are to be tagged
 * @param <T> The element type of the collection
 * @return A collection of (fold number, element) pairs
 */
public <T> PCollection<Pair<Integer, T>> apply(PCollection<T> pcollect) {
  PTypeFamily family = pcollect.getTypeFamily();
  PType<Pair<Integer, T>> foldedType = family.pairs(family.ints(), pcollect.getPType());

  MapFn<T, Pair<Integer, T>> assignFold = new MapFn<T, Pair<Integer, T>>() {
    // Declared transient and only created inside initialize().
    private transient RandomGenerator generator;

    @Override
    public void initialize() {
      if (generator != null) {
        return;
      }
      generator = RandomManager.getSeededRandom(seed);
    }

    @Override
    public Pair<Integer, T> map(T element) {
      return Pair.of(generator.nextInt(numFolds), element);
    }
  };

  return pcollect.parallelDo("crossfold", assignFold, foldedType);
}
示例3: groupedWeightedSample
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Draws a weighted sample of values for each key of the input table.
 *
 * @param input Table mapping each key to (value, weight) pairs
 * @param sampleSize The sample size handed to the sampling and combining functions
 * @param random The random generator handed to the sampling function
 * @param <K> Key type of the table
 * @param <T> Type of the sampled values
 * @param <N> Numeric type of the weights
 * @return A table of the sampled values per key, with the weights removed
 */
public static <K, T, N extends Number> PTable<K, T> groupedWeightedSample(
    PTable<K, Pair<T, N>> input,
    int sampleSize,
    RandomGenerator random) {
  PTypeFamily family = input.getTypeFamily();
  PType<K> keyType = input.getPTableType().getKeyType();
  // The value type is a pair; its first sub-type is the type of the sampled element.
  @SuppressWarnings("unchecked")
  PType<T> elementType = (PType<T>) input.getPTableType().getValueType().getSubTypes().get(0);
  PTableType<K, Pair<Double, T>> weightedType =
      family.tableOf(keyType, family.pairs(family.doubles(), elementType));

  // Fill reservoirs by mapping over the vectors and re-emitting them; each map task emits at most
  // sampleSize vectors per fold, and the combiner/reducer then pares the outputs down to
  // sampleSize vectors in total.
  SampleFn<K, T, N> sampler = new SampleFn<K, T, N>(sampleSize, random, elementType);
  PTable<K, Pair<Double, T>> candidates = input.parallelDo("reservoirSampling", sampler, weightedType);

  // Pare down to just a single reservoir with sampleSize vectors.
  PTable<K, Pair<Double, T>> finalReservoir =
      candidates.groupByKey(1).combineValues(new WRSCombineFn<K, T>(sampleSize, elementType));

  // Strip the weights off the final sampled reservoir and return.
  MapFn<Pair<K, Pair<Double, T>>, Pair<K, T>> dropWeight =
      new MapFn<Pair<K, Pair<Double, T>>, Pair<K, T>>() {
        @Override
        public Pair<K, T> map(Pair<K, Pair<Double, T>> weighted) {
          return Pair.of(weighted.first(), weighted.second().second());
        }
      };
  return finalReservoir.parallelDo("strippingSamplingWeights", dropWeight,
      family.tableOf(keyType, elementType));
}
示例4: swapKeyValue
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Exchanges the key and value of every entry in a PTable. The table's existing PTypes are reused,
 * in the opposite order, for the result.
 *
 * @param table PTable to process
 * @param <K> Key type (will become value type)
 * @param <V> Value type (will become key type)
 * @return {@code PTable<V, K>} containing the same data as the original
 */
public static <K, V> PTable<V, K> swapKeyValue(PTable<K, V> table) {
  PTypeFamily family = table.getTypeFamily();
  MapFn<Pair<K, V>, Pair<V, K>> flip = new MapFn<Pair<K, V>, Pair<V, K>>() {
    @Override
    public Pair<V, K> map(Pair<K, V> entry) {
      return Pair.of(entry.second(), entry.first());
    }
  };
  return table.parallelDo(flip, family.tableOf(table.getValueType(), table.getKeyType()));
}
示例5: sample
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Samples from a collection by giving every element a weight of 1 and delegating to
 * {@code weightedSample}.
 *
 * @param input The collection to sample from
 * @param sampleSize The sample size passed through to {@code weightedSample}
 * @param random The random generator passed through to {@code weightedSample}
 * @param <T> The element type of the collection
 * @return The collection returned by {@code weightedSample}
 */
public static <T> PCollection<T> sample(
    PCollection<T> input,
    int sampleSize,
    RandomGenerator random) {
  PTypeFamily family = input.getTypeFamily();
  PType<Pair<T, Integer>> weightedType = family.pairs(input.getPType(), family.ints());

  // Attach the same weight (1) to every element.
  MapFn<T, Pair<T, Integer>> unitWeight = new MapFn<T, Pair<T, Integer>>() {
    @Override
    public Pair<T, Integer> map(T element) {
      return Pair.of(element, 1);
    }
  };
  PCollection<Pair<T, Integer>> weighted = input.parallelDo(unitWeight, weightedType);

  return weightedSample(weighted, sampleSize, random);
}
示例6: weightedSample
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Samples from a collection of (element, weight) pairs by placing every pair under the single
 * key 0 and delegating to {@code groupedWeightedSample}.
 *
 * @param input The (element, weight) pairs to sample from
 * @param sampleSize The sample size passed through to {@code groupedWeightedSample}
 * @param random The random generator passed through to {@code groupedWeightedSample}
 * @param <T> The element type
 * @param <N> The numeric type of the weights
 * @return The values of the table returned by {@code groupedWeightedSample}
 */
public static <T, N extends Number> PCollection<T> weightedSample(
    PCollection<Pair<T, N>> input,
    int sampleSize,
    RandomGenerator random) {
  PTypeFamily family = input.getTypeFamily();

  // Every pair gets the same constant key, so the whole input forms one group.
  MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>> constantKey =
      new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
        @Override
        public Pair<Integer, Pair<T, N>> map(Pair<T, N> weighted) {
          return Pair.of(0, weighted);
        }
      };
  PTable<Integer, Pair<T, N>> singleGroup =
      input.parallelDo(constantKey, family.tableOf(family.ints(), input.getPType()));

  return groupedWeightedSample(singleGroup, sampleSize, random).values();
}
示例7: distributed
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Computes a set of percentiles for each key of a numerically-valued table.
 *
 * The work is done per key by counting, joining and sorting. This is highly scalable, but takes
 * 2 more map-reduce cycles than the in-memory variant, which is usable when the value set is
 * guaranteed to fit into memory. Use inMemory if you have less than the order of 10M values per key.
 *
 * The percentile definition used here is the "nearest rank" one described at:
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table numerically-valued PTable
 * @param p1 First percentile (in the range 0.0 - 1.0)
 * @param pn More percentiles (in the range 0.0 - 1.0)
 * @param <K> Key type of the table
 * @param <V> Value type of the table (must extend java.lang.Number)
 * @return PTable of each key with a collection of pairs of the percentile provided and its result.
 */
public static <K, V extends Number> PTable<K, Result<V>> distributed(PTable<K, V> table,
    double p1, double... pn) {
  final List<Double> requested = createListFromVarargs(p1, pn);
  PTypeFamily family = table.getTypeFamily();

  // Count the entries per key, then join that count back onto every (key, value) entry.
  PTable<K, Pair<Long, V>> countThenValue = table.keys().count().join(table);

  // Flip each joined pair from (count, value) to (value, count).
  PTable<K, Pair<V, Long>> valueThenCount = countThenValue.mapValues(
      new SwapPairComponents<Long, V>(),
      family.pairs(table.getValueType(), family.longs()));

  return SecondarySort.sortAndApply(
      valueThenCount,
      new DistributedPercentiles<K, V>(requested),
      family.tableOf(table.getKeyType(), Result.pType(table.getValueType())));
}
示例8: csvRecord
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Builds a derived {@code PType<Record>} on top of the type family's string type, using a
 * {@code CSV2RecordMapFn} and a {@code Record2CSVMapFn} constructed with the given delimiter.
 *
 * @param ptf The type family that supplies the string type and creates the derived type
 * @param delim The delimiter handed to both mapping functions
 * @return The derived PType for {@code Record}
 */
public static PType<Record> csvRecord(PTypeFamily ptf, String delim) {
  // NOTE(review): presumably string -> Record on input and Record -> string on output; confirm
  // against the two MapFn implementations.
  CSV2RecordMapFn fromCsv = new CSV2RecordMapFn(delim);
  Record2CSVMapFn toCsv = new Record2CSVMapFn(delim);
  return ptf.derived(Record.class, fromCsv, toCsv, ptf.strings());
}
示例9: inMemory
import org.apache.crunch.types.PTypeFamily; //導入依賴的package包/類
/**
 * Computes a set of percentiles for each key of a table with comparable values.
 *
 * The work is done per key by grouping, reading the data into memory, then sorting and
 * calculating. This is much faster than the distributed option, but if you get into the order of
 * 10M+ values per key, then performance might start to degrade or even cause OOMs.
 *
 * The percentile definition used here is the "nearest rank" one described at:
 * http://en.wikipedia.org/wiki/Percentile#Definition
 *
 * @param table PTable whose values are comparable
 * @param p1 First percentile (in the range 0.0 - 1.0)
 * @param pn More percentiles (in the range 0.0 - 1.0)
 * @param <K> Key type of the table
 * @param <V> Value type of the table (must implement java.lang.Comparable)
 * @return PTable of each key with a collection of pairs of the percentile provided and its result.
 */
public static <K, V extends Comparable> PTable<K, Result<V>> inMemory(PTable<K, V> table,
    double p1, double... pn) {
  final List<Double> requested = createListFromVarargs(p1, pn);
  PTypeFamily family = table.getTypeFamily();
  // Group by key, then hand each key's values to the in-memory percentile function.
  return table.groupByKey().parallelDo(
      new InMemoryPercentiles<K, V>(requested),
      family.tableOf(table.getKeyType(), Result.pType(table.getValueType())));
}