本文整理汇总了Scala中org.apache.hadoop.io.LongWritable类的典型用法代码示例。如果您正苦于以下问题：Scala LongWritable类的具体用法？Scala LongWritable怎么用？Scala LongWritable使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了LongWritable类的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: FPMiningPreprocessingApp
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import com.google.common.io.Files
/**
 * Batch job that reads text files and writes one line per input file containing the
 * distinct second tokens found in that file.
 *
 * Pipeline, as implemented below:
 *  1. every input line is keyed by the name (without extension) of the file it came from;
 *  2. the value is the second `delim`-separated token of the line;
 *  3. pairs whose value is the string "0" are dropped;
 *  4. duplicate (file, value) pairs are removed and the values are grouped per file;
 *  5. each group is joined with `delim`, roughly 70% of the groups are sampled
 *     (without replacement) and the result is written as a single partition.
 */
object FPMiningPreprocessingApp {

  /**
   * Entry point.
   *
   * @param args exactly three values: application name, input path, output path.
   *             Any other argument count prints a usage message and exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      System.err.println(
        "Usage: FPMiningPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }
    val Seq(appName, iPath, oPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
    // Single source of truth for the token separator (input split and output join).
    val delim = " "
    val sc = new SparkContext(conf)
    try {
      sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions)
        // The cast exposes mapPartitionsWithInputSplit, which is only defined on HadoopRDD.
        .asInstanceOf[HadoopRDD[LongWritable, Text]]
        .mapPartitionsWithInputSplit((iSplit, iter) =>
          // NOTE(review): a line with fewer than two tokens makes `(1)` throw and fails the task.
          iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(delim)(1))))
        .filter(r => r._2 != "0")
        .distinct()
        .groupByKey()
        .map(r => r._2.mkString(delim))
        .sample(withReplacement = false, fraction = 0.7)
        .coalesce(1)
        .saveAsTextFile(oPath)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
示例2: RedditVariationApp
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
/**
 * Streaming job that reads text files from a directory (one JSON document per line is
 * assumed by the `parse` call below) and demonstrates `union`, `repartition` and `glom`
 * on the resulting DStream.
 */
object RedditVariationApp {

  /**
   * Entry point.
   *
   * @param args exactly two values: application name and input directory.
   *             Any other argument count prints a usage message and exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditVariationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    // Accept every file in the directory, including ones that already exist at start-up.
    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val merged = comments.union(comments)
    val repartitionedComments = comments.repartition(4)
    // One record per non-empty partition: the line with the smallest "created_utc" value.
    // Empty partitions are skipped because minBy throws on an empty array.
    val rddMin = comments.glom()
      .filter(_.nonEmpty)
      .map(arr => arr.minBy(rec => (parse(rec) \ "created_utc").values.toString.toInt))

    // A StreamingContext refuses to start unless at least one output operation is
    // registered, so every derived stream is given one.
    merged.print()
    repartitionedComments.print()
    rddMin.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
示例3: FunctionalSyntaxOWLExpressionsDataSetBuilder
//设置package包名称以及导入依赖的类
package net.sansa_stack.owl.flink.dataset
import net.sansa_stack.owl.common.parsing.{FunctionalSyntaxExpressionBuilder, FunctionalSyntaxPrefixParsing}
import net.sansa_stack.owl.flink.hadoop.FunctionalSyntaxInputFormat
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.hadoop.io.{LongWritable, Text}
/** Builds an [[OWLExpressionsDataSet]] from a file read through `FunctionalSyntaxInputFormat`. */
object FunctionalSyntaxOWLExpressionsDataSetBuilder extends FunctionalSyntaxPrefixParsing {

  /**
   * Reads `filePath`, collects the prefix declarations found in it, and returns the
   * records cleaned by a `FunctionalSyntaxExpressionBuilder` configured with those prefixes.
   * Records for which the builder's `clean` returns `null` are dropped.
   *
   * @param env      Flink execution environment used to read the file
   * @param filePath location of the input file
   */
  def build(env: ExecutionEnvironment, filePath: String): OWLExpressionsDataSet = {
    import org.apache.flink.api.scala._

    val records: DataSet[(LongWritable, Text)] =
      env.readHadoopFile[LongWritable, Text](
        new FunctionalSyntaxInputFormat, classOf[LongWritable], classOf[Text], filePath)
    val expressions = records.map(record => record._2.toString)

    // Prefix declarations are materialised locally so they can parameterise the builder.
    val prefixPairs: Seq[(String, String)] =
      expressions
        .filter(expr => isPrefixDeclaration(expr))
        .map(expr => parsePrefix(expr))
        .collect()
    val prefixes: Map[String, String] = prefixPairs.toMap

    val expressionBuilder = new FunctionalSyntaxExpressionBuilder(prefixes)
    expressions
      .map(expr => expressionBuilder.clean(expr))
      .filter(cleaned => cleaned != null)
  }
}
示例4: ManchesterSyntaxOWLExpressionsDataSetBuilder
//设置package包名称以及导入依赖的类
package net.sansa_stack.owl.flink.dataset
import net.sansa_stack.owl.common.parsing.{ManchesterSyntaxExpressionBuilder, ManchesterSyntaxPrefixParsing}
import net.sansa_stack.owl.flink.hadoop.ManchesterSyntaxInputFormat
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.hadoop.io.{LongWritable, Text}
/** Builds an [[OWLExpressionsDataSet]] from a file read through `ManchesterSyntaxInputFormat`. */
object ManchesterSyntaxOWLExpressionsDataSetBuilder extends ManchesterSyntaxPrefixParsing {

  /**
   * Returns only the expressions data set produced by [[buildAndGetPrefixes]].
   *
   * @param env      Flink execution environment used to read the file
   * @param filePath location of the input file
   */
  def build(env: ExecutionEnvironment, filePath: String): OWLExpressionsDataSet = {
    val (expressions, _) = buildAndGetPrefixes(env, filePath)
    expressions
  }

  /**
   * Reads `filePath`, collects its prefix declarations, and returns the cleaned
   * expressions together with the prefix map. Records for which the builder's `clean`
   * returns `null` are dropped.
   */
  private[dataset] def buildAndGetPrefixes(env: ExecutionEnvironment,
      filePath: String): (OWLExpressionsDataSet, Map[String, String]) = {
    import org.apache.flink.api.scala._

    val records: DataSet[(LongWritable, Text)] =
      env.readHadoopFile[LongWritable, Text](
        new ManchesterSyntaxInputFormat, classOf[LongWritable], classOf[Text], filePath)
    val frames = records.map(record => record._2.toString)

    // Prefix declarations are materialised locally so they can parameterise the builder.
    val prefixPairs: Seq[(String, String)] =
      frames
        .filter(frame => isPrefixDeclaration(frame))
        .map(frame => parsePrefix(frame))
        .collect()
    val prefixes: Map[String, String] = prefixPairs.toMap

    val expressionBuilder = new ManchesterSyntaxExpressionBuilder(prefixes)
    val cleaned = frames
      .map(frame => expressionBuilder.clean(frame))
      .filter(result => result != null)
    (cleaned, prefixes)
  }
}
示例5: ManchesterSyntaxOWLExpressionsRDDBuilder
//设置package包名称以及导入依赖的类
package net.sansa_stack.owl.spark.rdd
import net.sansa_stack.owl.common.parsing.{ManchesterSyntaxExpressionBuilder, ManchesterSyntaxPrefixParsing}
import net.sansa_stack.owl.spark.hadoop.ManchesterSyntaxInputFormat
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.SparkContext
/** Builds an [[OWLExpressionsRDD]] from a file read through `ManchesterSyntaxInputFormat`. */
object ManchesterSyntaxOWLExpressionsRDDBuilder extends ManchesterSyntaxPrefixParsing {

  /**
   * Returns only the expressions RDD produced by [[buildAndGetPrefixes]].
   *
   * @param sc       Spark context used to read the file
   * @param filePath location of the input file
   */
  def build(sc: SparkContext, filePath: String): OWLExpressionsRDD = {
    val (expressions, _) = buildAndGetPrefixes(sc, filePath)
    expressions
  }

  /**
   * Reads `filePath`, collects its prefix declarations on the driver, and returns the
   * cleaned expressions together with the prefix map. Records for which the builder's
   * `clean` returns `null` are dropped.
   */
  private[spark] def buildAndGetPrefixes(sc: SparkContext, filePath: String): (OWLExpressionsRDD, Map[String, String]) = {
    val records = sc.hadoopFile(
      filePath,
      classOf[ManchesterSyntaxInputFormat],
      classOf[LongWritable],
      classOf[Text],
      sc.defaultMinPartitions)
    val frames = records.map(record => record._2.toString)

    val prefixPairs: Array[(String, String)] =
      frames
        .filter(frame => isPrefixDeclaration(frame))
        .map(frame => parsePrefix(frame))
        .collect()
    val prefixes: Map[String, String] = prefixPairs.toMap

    val expressionBuilder = new ManchesterSyntaxExpressionBuilder(prefixes)
    val cleaned = frames
      .map(frame => expressionBuilder.clean(frame))
      .filter(result => result != null)
    (cleaned, prefixes)
  }
}
示例6: FunctionalSyntaxOWLExpressionsRDDBuilder
//设置package包名称以及导入依赖的类
package net.sansa_stack.owl.spark.rdd
import net.sansa_stack.owl.common.parsing.{FunctionalSyntaxExpressionBuilder, FunctionalSyntaxPrefixParsing}
import net.sansa_stack.owl.spark.hadoop.FunctionalSyntaxInputFormat
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.SparkContext
/** Builds an [[OWLExpressionsRDD]] from a file read through `FunctionalSyntaxInputFormat`. */
object FunctionalSyntaxOWLExpressionsRDDBuilder extends Serializable with FunctionalSyntaxPrefixParsing {

  /**
   * Reads `filePath`, collects its prefix declarations on the driver, and returns the
   * records cleaned by a `FunctionalSyntaxExpressionBuilder` configured with those
   * prefixes. Records for which the builder's `clean` returns `null` are dropped.
   *
   * @param sc       Spark context used to read the file
   * @param filePath location of the input file
   */
  def build(sc: SparkContext, filePath: String): OWLExpressionsRDD = {
    val expressions = sc
      .hadoopFile(
        filePath,
        classOf[FunctionalSyntaxInputFormat],
        classOf[LongWritable],
        classOf[Text],
        sc.defaultMinPartitions)
      .map(record => record._2.toString)

    val prefixPairs: Array[(String, String)] =
      expressions
        .filter(expr => isPrefixDeclaration(expr))
        .map(expr => parsePrefix(expr))
        .collect()
    val prefixes: Map[String, String] = prefixPairs.toMap

    val expressionBuilder = new FunctionalSyntaxExpressionBuilder(prefixes)
    expressions
      .map(expr => expressionBuilder.clean(expr))
      .filter(cleaned => cleaned != null)
  }
}
示例7: ADAMContextExtensions
//设置package包名称以及导入依赖的类
package org.bdgenomics.adam.rdd
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.api.java.StorageLevels
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.converters.FastaConverter
import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD
import org.bdgenomics.utils.instrumentation.Metrics
import org.apache.spark.rdd.MetricsContext._
import org.bdgenomics.adam.rdd.feature.FeatureRDD
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.formats.avro.Feature
/** Extension methods added to `SparkContext` through the implicit class [[ADAMContextExtensions.spExt]]. */
object ADAMContextExtensions {

  implicit class spExt(val sparkContext: SparkContext) extends HDFSFilesExtensions{

    /**
     * Loads a text file, converts it with `FastaConverter`, and persists the result
     * with `StorageLevels.MEMORY_AND_DISK`.
     *
     * @param filePath       location of the input file
     * @param fragmentLength value forwarded unchanged to `FastaConverter`
     * @return the persisted fragments wrapped in a `NucleotideContigFragmentRDD`
     */
    def loadFastaPersistent(
      filePath: String,
      fragmentLength: Long = 10000L): NucleotideContigFragmentRDD = {
      val fastaData: RDD[(LongWritable, Text)] = sparkContext.newAPIHadoopFile(
        filePath,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]
      )
      // instrument() returns a new RDD; the result has to be used for it to take effect.
      val source: RDD[(LongWritable, Text)] =
        if (Metrics.isRecording) fastaData.instrument() else fastaData
      // Unwrap the Hadoop writables into plain (Long, String) pairs.
      val remapData = source.map(kv => (kv._1.get, kv._2.toString))
      // convert rdd and cache
      val fragmentRdd = FastaConverter(remapData, fragmentLength)
        .persist(StorageLevels.MEMORY_AND_DISK)
      NucleotideContigFragmentRDD(fragmentRdd)
    }

    /**
     * Folds a list of feature RDDs into one, left to right.
     *
     * Each step region-joins the accumulator with the next RDD and, for every joined
     * pair, widens the left feature to `min(start)` / `max(end)` of the two.
     *
     * @param features RDDs to merge
     * @return `None` for an empty list, the single element for a one-element list,
     *         otherwise the folded result
     */
    def mergeFeatures(features: List[FeatureRDD]): Option[FeatureRDD] = features match {
      case Nil => None
      case head :: Nil => Some(head)
      case head :: tail =>
        val merged = tail.foldLeft(head){
          case (acc, feature) =>
            val joined = acc.broadcastRegionJoin(feature)
            acc.transform(_ => joined.rdd.map{
              case (one, two) =>
                // NOTE(review): `one` is mutated in place rather than copied.
                one.setStart(Math.min(one.getStart, two.getStart))
                one.setEnd(Math.max(one.getEnd, two.getEnd))
                one
            })
        }
        Some(merged)
    }
  }
}
示例8: VoyagerApp
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
/**
 * Streaming job that reads whitespace-separated text records from a directory and, per
 * batch, counts the records per key whose columns 18..27 contain a value above 1.0.
 * The key is the first column parsed as an integer.
 */
object VoyagerApp {

  /**
   * Entry point.
   *
   * @param args exactly three values: application name, input directory, output path prefix.
   *             Any other argument count prints a usage message and exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      System.err.println(
        "Usage: VoyagerApp <appname> <inputPath> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, inputPath, outputPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC")
    val ssc = new StreamingContext(conf, Seconds(10))

    // Accept every file in the directory, including ones that already exist at start-up.
    val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    voyager1
      .map { rec =>
        val attrs = rec.split("\\s+")
        // NOTE(review): non-numeric fields make toInt/toDouble throw and fail the batch.
        (attrs(0).toInt, attrs.slice(18, 28).map(_.toDouble))
      }
      .filter(pflux => pflux._2.exists(_ > 1.0))
      .map(rec => (rec._1, 1))
      .reduceByKey(_ + _)
      // Descending key order, collapsed into one partition per batch.
      .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
示例9: VoyagerAppKryo
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
/**
 * Variant of the Voyager streaming job that enables Kryo serialization and registers
 * `ProtonFlux` with it. Per batch, records are projected into `ProtonFlux`, filtered by
 * `isSolarStorm`, counted per `year`, and written out in descending key order.
 */
object VoyagerAppKryo {

  /**
   * Entry point.
   *
   * @param args exactly three values: application name, input directory, output path prefix.
   *             Any other argument count prints a usage message and exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      System.err.println(
        "Usage: VoyagerAppKryo <appname> <inputPath> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, inputPath, outputPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[ProtonFlux]))
    val ssc = new StreamingContext(conf, Seconds(10))

    // Accept every file in the directory, including ones that already exist at start-up.
    val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    // Column 0 plus columns 18..28 of each whitespace-separated record.
    val projected = voyager1.map { rec =>
      val attrs = rec.split("\\s+")
      new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21),
        attrs(22), attrs(23), attrs(24), attrs(25), attrs(26), attrs(27),
        attrs(28))
    }
    val filtered = projected.filter(pflux => pflux.isSolarStorm)
    val yearlyBreakdown = filtered.map(rec => (rec.year, 1))
      .reduceByKey(_ + _)
      .transform(rec => rec.sortByKey(ascending = false))
    yearlyBreakdown.saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
示例10: CollabFilteringPreprocessingApp
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import com.google.common.io.Files
/**
 * Batch job that reads text files and writes "<file> <token> <count>" lines.
 *
 * Pipeline, as implemented below:
 *  1. every input line is keyed by the name (without extension) of the file it came from;
 *  2. the value is the second `delim`-separated token of the line;
 *  3. pairs whose value is the string "0" are dropped;
 *  4. occurrences of each (file, value) pair are counted;
 *  5. each result is formatted as file name (with "subject" removed), value and count
 *     joined by `delim`; roughly 70% of the lines are sampled (without replacement) and
 *     written as a single partition.
 */
object CollabFilteringPreprocessingApp {

  /**
   * Entry point.
   *
   * @param args exactly three values: application name, input path, output path.
   *             Any other argument count prints a usage message and exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }
    val Seq(appName, iPath, oPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
    // Single source of truth for the token separator (input split and output join).
    val delim = " "
    val sc = new SparkContext(conf)
    try {
      sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions)
        // The cast exposes mapPartitionsWithInputSplit, which is only defined on HadoopRDD.
        .asInstanceOf[HadoopRDD[LongWritable, Text]]
        .mapPartitionsWithInputSplit((iSplit, iter) =>
          // NOTE(review): a line with fewer than two tokens makes `(1)` throw and fails the task.
          iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(delim)(1))))
        .filter(r => r._2 != "0")
        .map(r => ((r._1, r._2), 1))
        .reduceByKey(_ + _)
        .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2)
        .sample(withReplacement = false, fraction = 0.7)
        .coalesce(1)
        .saveAsTextFile(oPath)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
示例11: RedditAggregationApp
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
/**
 * Streaming job that reads text files from a directory (one JSON document per line is
 * assumed by the `parse` call below) and demonstrates `count`, `countByValue` and
 * `reduce` on the resulting DStream.
 */
object RedditAggregationApp {

  /**
   * Entry point.
   *
   * @param args exactly two values: application name and input directory.
   *             Any other argument count prints a usage message and exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditAggregationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    // Accept every file in the directory, including ones that already exist at start-up.
    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    // Number of records in each batch.
    val recCount = comments.count()
    // Occurrences of each distinct record in each batch.
    val recCountValue = comments.countByValue()
    // Number of space-separated tokens across the "body" fields of each batch.
    val totalWords = comments.map(rec => (parse(rec) \ "body").values.toString)
      .flatMap(body => body.split(" "))
      .map(_ => 1)
      .reduce(_ + _)

    // A StreamingContext refuses to start unless at least one output operation is
    // registered, so every aggregate is given one.
    recCount.print()
    recCountValue.print()
    totalWords.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
示例12: FrequencyMapper
//设置package包名称以及导入依赖的类
package com.argcv.iphigenia.example.hdfs.mr
import org.apache.hadoop.io.{ IntWritable, LongWritable, Text }
import org.apache.hadoop.mapreduce.Mapper
/**
 * Mapper that emits `(second comma-separated field, 1)` for every input line.
 * Lines with fewer than two fields are skipped and counted instead of failing the task.
 */
class FrequencyMapper extends Mapper[LongWritable, Text, Text, IntWritable] {
  type Context = Mapper[LongWritable, Text, Text, IntWritable]#Context

  /**
   * @param offset   input key supplied by the input format; not used
   * @param lineText one input line
   * @param context  receives the `(field, 1)` pair, or a counter increment for
   *                 lines that have no second field
   */
  override def map(offset: LongWritable, lineText: Text, context: Context): Unit = {
    val fields = lineText.toString.split(",")
    if (fields.length > 1) {
      context.write(new Text(fields(1)), FrequencyMapper.ONE)
    } else {
      // Blank or malformed line: record it rather than throwing ArrayIndexOutOfBounds.
      context.getCounter("FrequencyMapper", "MALFORMED_LINES").increment(1L)
    }
  }
}

object FrequencyMapper {
  /** The mapper's class object, obtained without instantiating a mapper. */
  def instance: Class[_ <: FrequencyMapper] = classOf[FrequencyMapper]

  /** Shared writable holding the constant 1 emitted for every record. */
  lazy val ONE = new IntWritable(1)
}