本文整理汇总了 Scala 中 org.apache.spark.unsafe.types.UTF8String 类的典型用法代码示例。如果您正苦于以下问题:Scala UTF8String 类的具体用法?Scala UTF8String 怎么用?Scala UTF8String 使用的例子?那么这里精选的代码示例或许可以为您提供帮助。
下文一共展示了 UTF8String 类的 8 个代码示例,这些例子默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞,您的评价将有助于系统推荐出更好的 Scala 代码示例。
示例1: ContentToHash
//设置package包名称以及导入依赖的类
package com.xuzq.hotNews
import org.apache.spark.SparkException
import org.apache.spark.unsafe.hash.Murmur3_x86_32._
import org.apache.spark.unsafe.types.UTF8String
/**
 * Hashes terms with the 32-bit Murmur3 functions imported from
 * `org.apache.spark.unsafe.hash.Murmur3_x86_32` and reduces the hash to a bucket index.
 */
class ContentToHash {
  /** Seed passed to every Murmur3 call made by this class. */
  val seed: Int = 42

  /**
   * Maps `word` to a bucket index.
   *
   * @param word term to hash; `null` hashes to `seed`
   * @param mod  number of buckets
   * @return `murmur3Hash(word)` reduced with [[nonNegativeMod]]
   */
  def getHashCode(word: String, mod: Int): Int =
    nonNegativeMod(murmur3Hash(word), mod)

  /**
   * Remainder of `x / mod`, shifted up by `mod` when the raw remainder is negative.
   *
   * For a positive `mod` the result lies in `[0, mod)`. A `mod` of zero fails inside `%`
   * with an `ArithmeticException`.
   */
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  /**
   * Computes the Murmur3 hash of `term` using `seed`.
   *
   * @param term a `null`, `Boolean`, `Byte`, `Short`, `Int`, `Long`, `Float`, `Double` or `String`
   * @return `seed` for `null`, otherwise the hash of the value
   * @throws org.apache.spark.SparkException for any other runtime type
   */
  def murmur3Hash(term: Any): Int = {
    term match {
      case null => seed
      case b: Boolean => hashInt(if (b) 1 else 0, seed)
      case b: Byte => hashInt(b, seed)
      case s: Short => hashInt(s, seed)
      case i: Int => hashInt(i, seed)
      case l: Long => hashLong(l, seed)
      // Floating-point values are hashed through their IEEE-754 bit patterns.
      case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed)
      case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed)
      case s: String =>
        // Strings are hashed over the bytes held by their UTF8String form.
        val utf8 = UTF8String.fromString(s)
        hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed)
      case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " +
        s"support type ${term.getClass.getCanonicalName} of input data.")
    }
  }
}
示例2: EmbeddedMapType
//设置package包名称以及导入依赖的类
package org.apache.spark.orientdb.udts
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import org.apache.spark.sql.catalyst.expressions.UnsafeMapData
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
 * Wrapper around a Scala map, bound to the `EmbeddedMapType` UDT through the
 * `SQLUserDefinedType` annotation.
 *
 * @param elements the wrapped key/value pairs
 */
@SQLUserDefinedType(udt = classOf[EmbeddedMapType])
case class EmbeddedMap(elements: Map[Any, Any]) extends Serializable {
  // Derive the hash from the wrapped map: it agrees with `equals` below (equal maps have
  // equal hash codes) and avoids placing every instance in the same hash bucket.
  override def hashCode(): Int = elements.hashCode()

  /** Two instances are equal when their wrapped maps are equal. */
  override def equals(other: scala.Any): Boolean = other match {
    case that: EmbeddedMap => that.elements == this.elements
    case _ => false
  }

  /** Entries joined with ", ". */
  override def toString: String = elements.mkString(", ")
}
/**
 * Spark SQL user-defined type for [[EmbeddedMap]].
 *
 * Every key and every value is written with Java serialization and the resulting bytes are
 * wrapped in a `UTF8String`, so the SQL representation is `MapType(StringType, StringType)`.
 */
class EmbeddedMapType extends UserDefinedType[EmbeddedMap] {
  override def sqlType: DataType = MapType(StringType, StringType)

  /**
   * Converts `obj` into map data holding the serialized keys and values.
   *
   * Keys and values are produced from a single pass over the entries so that position `i`
   * of the key array always pairs with position `i` of the value array. (Mapping over
   * `keySet` builds a new set whose iteration order need not follow the original map.)
   */
  override def serialize(obj: EmbeddedMap): Any = {
    val (keys, values) = obj.elements.toSeq.map { case (key, value) =>
      (javaSerialize(key), javaSerialize(value))
    }.unzip
    ArrayBasedMapData(keys.toArray, values.toArray)
  }

  /**
   * Rebuilds an [[EmbeddedMap]] from map data produced by [[serialize]].
   *
   * Any `MapData` implementation is accepted, which covers both `UnsafeMapData` and the
   * `ArrayBasedMapData` that [[serialize]] returns. Other inputs abort via `sys.error`.
   */
  override def deserialize(datum: Any): EmbeddedMap = datum match {
    case values: org.apache.spark.sql.catalyst.util.MapData =>
      val keys = values.keyArray().toArray[UTF8String](StringType).map(javaDeserialize)
      val vals = values.valueArray().toArray[UTF8String](StringType).map(javaDeserialize)
      new EmbeddedMap(keys.zip(vals).toMap[Any, Any])
    case other => sys.error(s"Cannot deserialize $other")
  }

  override def userClass: Class[EmbeddedMap] = classOf[EmbeddedMap]

  /** Java-serializes `value` and wraps the bytes in a `UTF8String`. */
  private def javaSerialize(value: Any): UTF8String = {
    val buffer = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(buffer)
    // Closing flushes the object stream before the bytes are read back from the buffer.
    try out.writeObject(value) finally out.close()
    UTF8String.fromBytes(buffer.toByteArray)
  }

  /**
   * Reads one Java-serialized object from the bytes of `encoded`.
   *
   * NOTE(review): Java deserialization of data from an untrusted source is unsafe; confirm
   * that the rows reaching this UDT are trusted.
   */
  private def javaDeserialize(encoded: UTF8String): Any = {
    val in = new ObjectInputStream(new ByteArrayInputStream(encoded.getBytes))
    try in.readObject() finally in.close()
  }
}

/** Singleton instance of [[EmbeddedMapType]]. */
object EmbeddedMapType extends EmbeddedMapType
示例3: EmbeddedSetType
//设置package包名称以及导入依赖的类
package org.apache.spark.orientdb.udts
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
 * Wrapper around an array of arbitrary elements, bound to the `EmbeddedSetType` UDT through
 * the `SQLUserDefinedType` annotation.
 *
 * @param elements the wrapped elements
 */
@SQLUserDefinedType(udt = classOf[EmbeddedSetType])
case class EmbeddedSet(elements: Array[Any]) extends Serializable {
  /** Order-dependent hash: starts at 1 and folds `31 * h + elementHash`, with 0 for `null`. */
  override def hashCode(): Int =
    elements.foldLeft(1) { (acc, element) =>
      31 * acc + (if (element == null) 0 else element.hashCode())
    }

  /** Equal when the other value is an `EmbeddedSet` whose array has the same elements in order. */
  override def equals(other: scala.Any): Boolean = other match {
    case peer: EmbeddedSet => peer.elements.sameElements(elements)
    case _ => false
  }

  /** Elements joined with ", ". */
  override def toString: String = elements.mkString(", ")
}
/**
 * Spark SQL user-defined type for [[EmbeddedSet]].
 *
 * Each element is written with Java serialization and the bytes are wrapped in a
 * `UTF8String`; the SQL representation is `ArrayType(StringType)`.
 */
class EmbeddedSetType extends UserDefinedType[EmbeddedSet] {
  override def sqlType: DataType = ArrayType(StringType)

  /** Serializes every element of `obj` and returns them as `GenericArrayData`. */
  override def serialize(obj: EmbeddedSet): Any = {
    val encoded = obj.elements.map(element => writeElement(element))
    new GenericArrayData(encoded)
  }

  /** Rebuilds an [[EmbeddedSet]] from `ArrayData`; any other input aborts via `sys.error`. */
  override def deserialize(datum: Any): EmbeddedSet = datum match {
    case values: ArrayData =>
      new EmbeddedSet(values.toArray[UTF8String](StringType).map(readElement))
    case other => sys.error(s"Cannot deserialize $other")
  }

  override def userClass: Class[EmbeddedSet] = classOf[EmbeddedSet]

  /** Java-serializes `element` and wraps the resulting bytes in a `UTF8String`. */
  private def writeElement(element: Any): UTF8String = {
    val buffer = new ByteArrayOutputStream()
    new ObjectOutputStream(buffer).writeObject(element)
    UTF8String.fromBytes(buffer.toByteArray)
  }

  /** Reads one Java-serialized object back from the bytes of `encoded`. */
  private def readElement(encoded: UTF8String): Any =
    new ObjectInputStream(new ByteArrayInputStream(encoded.getBytes)).readObject()
}

/** Singleton instance of [[EmbeddedSetType]]. */
object EmbeddedSetType extends EmbeddedSetType
示例4: LinkSetType
//设置package包名称以及导入依赖的类
package org.apache.spark.orientdb.udts
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import com.orientechnologies.orient.core.record.ORecord
import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
 * Wrapper around an array of `ORecord` instances, bound to the `LinkSetType` UDT through
 * the `SQLUserDefinedType` annotation.
 *
 * @param elements the wrapped records
 */
@SQLUserDefinedType(udt = classOf[LinkSetType])
case class LinkSet(elements: Array[_ <: ORecord]) extends Serializable {
  /** Order-dependent hash: starts at 1 and folds `31 * h + recordHash`, with 0 for `null`. */
  override def hashCode(): Int =
    elements.foldLeft(1) { (acc, record) =>
      31 * acc + (if (record == null) 0 else record.hashCode())
    }

  /** Equal when the other value is a `LinkSet` whose array has the same elements in order. */
  override def equals(other: scala.Any): Boolean = other match {
    case peer: LinkSet => peer.elements.sameElements(elements)
    case _ => false
  }

  /** Records joined with ", ". */
  override def toString: String = elements.mkString(", ")
}
/**
 * Spark SQL user-defined type for [[LinkSet]].
 *
 * Each record is written with Java serialization and the bytes are wrapped in a
 * `UTF8String`; the SQL representation is `ArrayType(StringType)`.
 */
class LinkSetType extends UserDefinedType[LinkSet] {
  override def sqlType: DataType = ArrayType(StringType)

  /** Serializes every record of `obj` and returns them as `GenericArrayData`. */
  override def serialize(obj: LinkSet): Any = {
    val encoded = obj.elements.map(record => writeRecord(record))
    new GenericArrayData(encoded)
  }

  /** Rebuilds a [[LinkSet]] from `ArrayData`; any other input aborts via `sys.error`. */
  override def deserialize(datum: Any): LinkSet = datum match {
    case values: ArrayData =>
      new LinkSet(values.toArray[UTF8String](StringType).map(readRecord))
    case other => sys.error(s"Cannot deserialize $other")
  }

  override def userClass: Class[LinkSet] = classOf[LinkSet]

  /** Java-serializes `record` and wraps the resulting bytes in a `UTF8String`. */
  private def writeRecord(record: ORecord): UTF8String = {
    val buffer = new ByteArrayOutputStream()
    new ObjectOutputStream(buffer).writeObject(record)
    UTF8String.fromBytes(buffer.toByteArray)
  }

  /** Reads one Java-serialized object from `encoded` and casts it to `ORecord`. */
  private def readRecord(encoded: UTF8String): ORecord = {
    val stream = new ObjectInputStream(new ByteArrayInputStream(encoded.getBytes))
    stream.readObject().asInstanceOf[ORecord]
  }
}

/** Singleton instance of [[LinkSetType]]. */
object LinkSetType extends LinkSetType
示例5: LinkMapType
//设置package包名称以及导入依赖的类
package org.apache.spark.orientdb.udts
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import com.orientechnologies.orient.core.record.ORecord
import org.apache.spark.sql.catalyst.expressions.UnsafeMapData
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
 * Wrapper around a map from string keys to `ORecord` values, bound to the `LinkMapType` UDT
 * through the `SQLUserDefinedType` annotation.
 *
 * @param elements the wrapped key/record pairs
 */
@SQLUserDefinedType(udt = classOf[LinkMapType])
case class LinkMap(elements: Map[String, _ <: ORecord]) extends Serializable {
  // Derive the hash from the wrapped map: it agrees with `equals` below (equal maps have
  // equal hash codes) and avoids placing every instance in the same hash bucket.
  override def hashCode(): Int = elements.hashCode()

  /** Two instances are equal when their wrapped maps are equal. */
  override def equals(other: scala.Any): Boolean = other match {
    case that: LinkMap => that.elements == this.elements
    case _ => false
  }

  /** Entries joined with ", ". */
  override def toString: String = elements.mkString(", ")
}
/**
 * Spark SQL user-defined type for [[LinkMap]].
 *
 * Every key and every record is written with Java serialization and the resulting bytes are
 * wrapped in a `UTF8String`, so the SQL representation is `MapType(StringType, StringType)`.
 */
class LinkMapType extends UserDefinedType[LinkMap] {
  override def sqlType: DataType = MapType(StringType, StringType)

  /**
   * Converts `obj` into map data holding the serialized keys and records.
   *
   * Keys and values are produced from a single pass over the entries so that position `i`
   * of the key array always pairs with position `i` of the value array. (Mapping over
   * `keySet` builds a new set whose iteration order need not follow the original map.)
   */
  override def serialize(obj: LinkMap): Any = {
    val (keys, values) = obj.elements.toSeq.map { case (key, record) =>
      (javaSerialize(key), javaSerialize(record))
    }.unzip
    ArrayBasedMapData(keys.toArray, values.toArray)
  }

  /**
   * Rebuilds a [[LinkMap]] from map data produced by [[serialize]].
   *
   * Any `MapData` implementation is accepted, which covers both `UnsafeMapData` and the
   * `ArrayBasedMapData` that [[serialize]] returns. Other inputs abort via `sys.error`.
   */
  override def deserialize(datum: Any): LinkMap = datum match {
    case values: org.apache.spark.sql.catalyst.util.MapData =>
      val keys = values.keyArray().toArray[UTF8String](StringType).map { encoded =>
        // Keys are restored through `toString` of the deserialized object.
        javaDeserialize(encoded).toString
      }
      val records = values.valueArray().toArray[UTF8String](StringType).map { encoded =>
        javaDeserialize(encoded).asInstanceOf[ORecord]
      }
      new LinkMap(keys.zip(records).toMap)
    case other => sys.error(s"Cannot deserialize $other")
  }

  override def userClass: Class[LinkMap] = classOf[LinkMap]

  /** Java-serializes `value` and wraps the bytes in a `UTF8String`. */
  private def javaSerialize(value: Any): UTF8String = {
    val buffer = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(buffer)
    // Closing flushes the object stream before the bytes are read back from the buffer.
    try out.writeObject(value) finally out.close()
    UTF8String.fromBytes(buffer.toByteArray)
  }

  /**
   * Reads one Java-serialized object from the bytes of `encoded`.
   *
   * NOTE(review): Java deserialization of data from an untrusted source is unsafe; confirm
   * that the rows reaching this UDT are trusted.
   */
  private def javaDeserialize(encoded: UTF8String): AnyRef = {
    val in = new ObjectInputStream(new ByteArrayInputStream(encoded.getBytes))
    try in.readObject() finally in.close()
  }
}

/** Singleton instance of [[LinkMapType]]. */
object LinkMapType extends LinkMapType
示例6: EmbeddedListType
//设置package包名称以及导入依赖的类
package org.apache.spark.orientdb.udts
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
 * Wrapper around an array of arbitrary elements, bound to the `EmbeddedListType` UDT through
 * the `SQLUserDefinedType` annotation.
 *
 * @param elements the wrapped elements
 */
@SQLUserDefinedType(udt = classOf[EmbeddedListType])
case class EmbeddedList(elements: Array[Any]) extends Serializable {
  /** Order-dependent hash: starts at 1 and folds `31 * h + elementHash`, with 0 for `null`. */
  override def hashCode(): Int =
    elements.foldLeft(1) { (acc, element) =>
      31 * acc + (if (element == null) 0 else element.hashCode())
    }

  /** Equal when the other value is an `EmbeddedList` whose array has the same elements in order. */
  override def equals(other: scala.Any): Boolean = other match {
    case peer: EmbeddedList => peer.elements.sameElements(elements)
    case _ => false
  }

  /** Elements joined with ", ". */
  override def toString: String = elements.mkString(", ")
}
/**
 * Spark SQL user-defined type for [[EmbeddedList]].
 *
 * Each element is written with Java serialization and the bytes are wrapped in a
 * `UTF8String`; the SQL representation is `ArrayType(StringType)`.
 */
class EmbeddedListType extends UserDefinedType[EmbeddedList] {
  override def sqlType: DataType = ArrayType(StringType)

  /** Serializes every element of `obj` and returns them as `GenericArrayData`. */
  override def serialize(obj: EmbeddedList): Any = {
    val encoded = obj.elements.map(element => writeElement(element))
    new GenericArrayData(encoded)
  }

  /** Rebuilds an [[EmbeddedList]] from `ArrayData`; any other input aborts via `sys.error`. */
  override def deserialize(datum: Any): EmbeddedList = datum match {
    case values: ArrayData =>
      new EmbeddedList(values.toArray[UTF8String](StringType).map(readElement))
    case other => sys.error(s"Cannot deserialize $other")
  }

  override def userClass: Class[EmbeddedList] = classOf[EmbeddedList]

  /** Java-serializes `element` and wraps the resulting bytes in a `UTF8String`. */
  private def writeElement(element: Any): UTF8String = {
    val buffer = new ByteArrayOutputStream()
    new ObjectOutputStream(buffer).writeObject(element)
    UTF8String.fromBytes(buffer.toByteArray)
  }

  /** Reads one Java-serialized object back from the bytes of `encoded`. */
  private def readElement(encoded: UTF8String): Any =
    new ObjectInputStream(new ByteArrayInputStream(encoded.getBytes)).readObject()
}

/** Singleton instance of [[EmbeddedListType]]. */
object EmbeddedListType extends EmbeddedListType
示例7: LinkListType
//设置package包名称以及导入依赖的类
package org.apache.spark.orientdb.udts
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import com.orientechnologies.orient.core.record.ORecord
import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
 * Wrapper around an array of `ORecord` instances, bound to the `LinkListType` UDT through
 * the `SQLUserDefinedType` annotation.
 *
 * @param elements the wrapped records
 */
@SQLUserDefinedType(udt = classOf[LinkListType])
case class LinkList(elements: Array[_ <: ORecord]) extends Serializable {
  /** Order-dependent hash: starts at 1 and folds `31 * h + recordHash`, with 0 for `null`. */
  override def hashCode(): Int =
    elements.foldLeft(1) { (acc, record) =>
      31 * acc + (if (record == null) 0 else record.hashCode())
    }

  /** Equal when the other value is a `LinkList` whose array has the same elements in order. */
  override def equals(other: scala.Any): Boolean = other match {
    case peer: LinkList => peer.elements.sameElements(elements)
    case _ => false
  }

  /** Records joined with ", ". */
  override def toString: String = elements.mkString(", ")
}
/**
 * Spark SQL user-defined type for [[LinkList]].
 *
 * Each record is written with Java serialization and the bytes are wrapped in a
 * `UTF8String`; the SQL representation is `ArrayType(StringType)`.
 */
class LinkListType extends UserDefinedType[LinkList] {
  override def sqlType: DataType = ArrayType(StringType)

  /** Serializes every record of `obj` and returns them as `GenericArrayData`. */
  override def serialize(obj: LinkList): Any = {
    val encoded = obj.elements.map(record => writeRecord(record))
    new GenericArrayData(encoded)
  }

  /** Rebuilds a [[LinkList]] from `ArrayData`; any other input aborts via `sys.error`. */
  override def deserialize(datum: Any): LinkList = datum match {
    case values: ArrayData =>
      new LinkList(values.toArray[UTF8String](StringType).map(readRecord))
    case other => sys.error(s"Cannot deserialize $other")
  }

  override def userClass: Class[LinkList] = classOf[LinkList]

  /** Java-serializes `record` and wraps the resulting bytes in a `UTF8String`. */
  private def writeRecord(record: ORecord): UTF8String = {
    val buffer = new ByteArrayOutputStream()
    new ObjectOutputStream(buffer).writeObject(record)
    UTF8String.fromBytes(buffer.toByteArray)
  }

  /** Reads one Java-serialized object from `encoded` and casts it to `ORecord`. */
  private def readRecord(encoded: UTF8String): ORecord = {
    val stream = new ObjectInputStream(new ByteArrayInputStream(encoded.getBytes))
    stream.readObject().asInstanceOf[ORecord]
  }
}

/** Singleton instance of [[LinkListType]]. */
object LinkListType extends LinkListType
示例8: DefaultSource
//设置package包名称以及导入依赖的类
package pl.jborkowskijmartin.spark.mf
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String
/**
 * Minimal `FileFormat` implementation: it reports a fixed single-column schema and, for
 * every partitioned file, yields one row containing the string "hello".
 */
class DefaultSource extends FileFormat {

  /** Logs the call to stdout and returns a schema with one nullable string column, "line". */
  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    println(">>>InferSchema")
    val lineField = StructField("line", StringType, nullable = true)
    Some(StructType(Seq(lineField)))
  }

  /** Logs the call to stdout; no writer factory is provided (the method returns `null`). */
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    println(">>> prepareWrite")
    null
  }

  /** Returns a reader that ignores the file it is given and emits a single "hello" row. */
  override def buildReader(
      sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = {
    (_: PartitionedFile) => {
      val greeting = UTF8String.fromString("hello")
      Iterator(InternalRow(greeting))
    }
  }
}