本文整理汇总了Python中pyspark.mllib.feature.HashingTF.map方法的典型用法代码示例。如果您正苦于以下问题:Python HashingTF.map方法的具体用法?Python HashingTF.map怎么用?Python HashingTF.map使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.feature.HashingTF的用法示例。
在下文中一共展示了HashingTF.map方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: naivebayes_mllib
# 需要导入模块: from pyspark.mllib.feature import HashingTF [as 别名]
# 或者: from pyspark.mllib.feature.HashingTF import map [as 别名]
def naivebayes_mllib():
    """Train a Naive Bayes sentiment classifier on S3-hosted review text
    and print its accuracy on the held-out test set.

    Relies on names defined elsewhere in this module: ``sc`` (SparkContext),
    ``parsedoc`` and ``removeStopWords`` (text cleanup helpers), and
    pyspark's ``HashingTF``, ``LabeledPoint`` and ``NaiveBayes``.
    Prints the rounded accuracy; returns None.
    """
    # NOTE(review): hard-coded AWS credentials — these should come from the
    # environment or an IAM role, never from source code.
    AWS_ACCESS_KEY_ID = "XXXXXXXXXXXXXXXXXX"
    AWS_SECRET_ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

    def to_term_freqs(path):
        # One consistent pipeline for every corpus: parse, normalize
        # punctuation to spaces, lower-case, tokenize, drop stop words,
        # then hash each token into a term-frequency vector.
        # (The original applied these steps in a different order for each
        # of the four corpora; parsedoc-then-normalize is the canonical
        # order used for the negative training set.)
        docs = sc.textFile(path)
        cleaned = docs.map(parsedoc)
        cleaned = cleaned.map(
            lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())
        tokens = cleaned.flatMap(lambda x: x.split())
        tokens = tokens.map(removeStopWords)
        return HashingTF().transform(
            tokens.map(lambda x: x, preservesPartitioning=True))

    def to_labeled(tf_rdd, label):
        # Attach a class label (0.0 = negative, 1.0 = positive) to every
        # hashed term-frequency vector.
        return tf_rdd.map(lambda vec: LabeledPoint(label, vec))

    tr_folder = "s3n://usf-ml2/hwspark/train/"
    training = to_labeled(to_term_freqs(tr_folder + "neg/*.txt"), 0.0).union(
        to_labeled(to_term_freqs(tr_folder + "pos/*.txt"), 1.0))
    model = NaiveBayes.train(training)

    te_folder = "s3n://usf-ml2/hw_spark/test/"
    test_set = to_labeled(to_term_freqs(te_folder + "neg/*.txt"), 0.0).union(
        to_labeled(to_term_freqs(te_folder + "pos/*.txt"), 1.0))

    predictionAndLabel = test_set.map(
        lambda p: (model.predict(p.features), p.label))
    # The original divided by the summed token counts of the two test
    # corpora; that sum equals the union's element count, so this is the
    # same value expressed directly.  (lambda pair: ... replaces the
    # Python-2-only tuple-unpacking lambda.)
    correct = predictionAndLabel.filter(lambda pair: pair[0] == pair[1]).count()
    accuracy = correct * 1.0 / float(test_set.count())
    print("Accuracy is {}".format(round(accuracy, 5)))
示例2: create_labelPoints
# 需要导入模块: from pyspark.mllib.feature import HashingTF [as 别名]
# 或者: from pyspark.mllib.feature.HashingTF import map [as 别名]
def create_labelPoints(rawdata, label):
    """Hash each document's tokens into a TF vector and tag it with *label*.

    *rawdata* is an RDD of (id, text) pairs; the text is lower-cased and
    whitespace-split before hashing.  Returns an RDD of LabeledPoint.
    """
    tokenized = rawdata.map(
        lambda doc: doc[1].lower().split(" "), preservesPartitioning=True)
    term_freqs = HashingTF().transform(tokenized)
    return term_freqs.map(lambda vec: LabeledPoint(label, vec))