

Python HashingTF.transform Method Code Examples

This article collects and summarizes typical usage examples of the Python method pyspark.mllib.feature.HashingTF.transform. If you are wondering what exactly HashingTF.transform does, how to call it, or what real code that uses it looks like, the hand-picked examples below should help. You can also explore further usage examples of the enclosing class, pyspark.mllib.feature.HashingTF.


The following shows 15 code examples of the HashingTF.transform method, sorted by popularity by default.
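Before diving into the examples, here is a minimal, self-contained sketch of the basic call pattern (the application name, sample sentences, and feature dimension are illustrative placeholders, not values taken from the examples below): HashingTF.transform accepts either a single document given as a list of terms, returning a SparseVector, or an RDD of documents, returning an RDD of term-frequency vectors that can then be passed to IDF.

from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext(appName="HashingTFDemo")

# An RDD of documents, each document being a list of terms
documents = sc.parallelize([
    "spark hashing tf example".split(" "),
    "another small example document".split(" "),
])

tf = HashingTF(numFeatures=1000)      # 1,000 hash buckets; the class default is 2^20
tfVectors = tf.transform(documents)   # RDD of SparseVectors, one per document

# transform also accepts a single document (a list of terms)
singleVector = tf.transform("one more document".split(" "))

# Optionally re-weight the raw term frequencies by inverse document frequency
idfModel = IDF().fit(tfVectors)
tfidfVectors = idfModel.transform(tfVectors)

print(singleVector)
print(tfidfVectors.first())
sc.stop()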

Example 1: main

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def main():
	"""
	Driver program for a spam filter using Spark and MLLib
	"""

	# Consolidate the individual email files into a single spam file
	# and a single ham file
	makeDataFileFromEmails( "data/spam_2/", "data/spam.txt")
	makeDataFileFromEmails( "data/easy_ham_2/", "data/ham.txt" )

	# Create the Spark Context for parallel processing
	sc = SparkContext( appName="Spam Filter")

	# Load the spam and ham data files into RDDs
	spam = sc.textFile( "data/spam.txt" )
	ham = sc.textFile( "data/ham.txt" )

	# Create a HashingTF instance to map email text to vectors of 10,000 features.
	tf = HashingTF(numFeatures = 10000)

	# Each email is split into words, and each word is mapped to one feature.
	spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
	hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

	# Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
	positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
	negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

	# Combine positive and negative datasets into one
	data = positiveExamples.union(negativeExamples)

	# Split the data into 70% for training and 30% test data sets 
	( trainingData, testData ) = data.randomSplit( [0.7, 0.3] )

	# Cache the training data to optimize the Logistic Regression
	trainingData.cache() 

	# Train the model with Logistic Regression using the SGD algorithm.
	model = LogisticRegressionWithSGD.train(trainingData)

	# Create tuples of actual and predicted values
	labels_and_predictions = testData.map( lambda email: (email.label, model.predict( email.features) ) )

	# Calculate the error rate as number wrong / total number
	error_rate = labels_and_predictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
	print( "*********** SPAM FILTER RESULTS **********" )
	print( "\n" )
	print( "Error Rate: " + str( error_rate ) )
	print( "\n" )

	# Serialize the model for persistence
	pickle.dump( model, open( "spamFilter.pkl", "wb" ) )

	sc.stop()
Developer: badpaper, Project: coursework, Lines of code: 56, Source: spamFilter.py

Example 2: main

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def main(sc):

    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if w not in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # print(wordArr)

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc,"tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
#    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Developer: aditcoding, Project: zfs, Lines of code: 33, Source: ml.py

Example 3: tfidf

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
 def tfidf(self):
     self._create_rdd()
     hashingTF = HashingTF()
     tf = hashingTF.transform(self.token_rdd)
     idf = IDF(minDocFreq=2).fit(tf)
     tfidf = idf.transform(tf)
     return tfidf
Developer: nhu2000, Project: wiki-search, Lines of code: 9, Source: make_tfidf.py

Example 4: get_feature_vectors

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vector from the lines in input_file_obj using
    TF/IDF.

    Returns:
        vectors RDD

    """

    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus with million
    # tweets recommended dimensions are 50000 or 100000. Use higher
    # dimensions for larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()

    return input_text_rdd, tfidf
Developer: rohithvsm, Project: spark_exercises, Lines of code: 27, Source: tweets_kmeans_classifier.py

Example 5: generatedHashedFeatures

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def generatedHashedFeatures(tweet):
    # Tokenize the tweet text; the label is hard-coded to 0 here and would
    # normally be parsed from the tweet itself
    text = tweet.split(" ")

    htf = HashingTF(50000)
    lp = LabeledPoint(0, htf.transform(text))
    return lp
Developer: LeotisBuchanan, Project: stream-data-analysis-realtime, Lines of code: 9, Source: trainNaiveandCreateNaiveBayesModel.py

Example 6: TFIDF

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def TFIDF(source, destination):
    if destination[-1] != '/':
        destination=destination+'/'
    # Read every file under the source directory and keep only the tokenized text
    rdd = sc.wholeTextFiles(source).map(lambda name_text: name_text[1].split())
    tf=HashingTF()
    tfVectors=tf.transform(rdd).cache()
    a = tfVectors.collect()
    # Storing the TF values above in individual files, one per link
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d"%ind + ".txt"
        ind = ind + 1
        file = open(dest_path,'w')
        file.write(str(vector))
        file.close()
    # Calculating IDF Values for each case.
    idf=IDF()
    idfModel=idf.fit(tfVectors)
    tfIdfVectors=idfModel.transform(tfVectors)
    # Writing TF-IDF values to a single file.
    file = open(destination+"TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))
    try:
        for i in range(0, 100):
            print("")  # testing printing
    except KeyboardInterrupt:
        pass
Developer: rikinmathur, Project: EECS-6895-FINAL-PROJECT, Lines of code: 30, Source: maanittf.py

Example 7: main

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Read the input file
    data = sc.textFile(hdfs_path)

    # Tokenize each document
    documents = data.map(tokenize)
    documents.cache()

    # TF
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)

    # IDF
    idf = IDF(minDocFreq=2).fit(tf)
    
    # TFIDF
    tfidf = idf.transform(tf)

    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)

    # Zip each document with its TF-IDF vector and aggregate the per-term weights
    term_tfidf = documents.zip(tfidf).map(doc_tfidf)
    articles = term_tfidf.flatMap(lambda i: i).reduceByKey(lambda x, y: x + y)
    for article in articles.collect():
        item = {}
        item['text'] = article[0].encode('utf-8')
        item['size'] = int(article[1] * 10)
        send_mongodb(mongo_client, item)
Developer: yankaics, Project: zhangxinyun-spark, Lines of code: 37, Source: tfidf.py

Example 8: tfidf

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def tfidf(rdd_doc):
    hashingTF = HashingTF()
    trainTf = hashingTF.transform(rdd_doc)
    trainTf.cache()
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    trainTfidf.cache()
    return trainTfidf, lambda x: hashingTF.indexOf(x)
Developer: hendrydong, Project: StackOverFlow_Analysis_PySpark, Lines of code: 10, Source: tfidf_v2.py

Example 9: transform

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def transform(idf, article):
    """
    transform article to a sparse vector
    """
    token = tokenizing(article)
    hashingTF = HashingTF()
    tf_test = hashingTF.transform(token)
    return idf.transform(tf_test)
Developer: xiaoyubai, Project: wiki-search, Lines of code: 10, Source: model.py

Example 10: get_tfidf_features

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def get_tfidf_features(txt):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    return tfidf
Developer: Veterun, Project: SparkPythonHanhan, Lines of code: 10, Source: amazon_review_tfidf_normalized.py

Example 11: vectorize

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def vectorize(sc, rdd_words, size=0):
    '''
    Vectorize the words with term frequency (TF).
    The vector dimension must be chosen; HashingTF defaults to 2^20.
    '''
    if not size:
        size = rdd_words.flatMap(lambda x: x).distinct().count() + 10000
    hashingTF = HashingTF(size)
    tf = hashingTF.transform(rdd_words)
    return tf
Developer: 2221758805, Project: SparkDemo, Lines of code: 12, Source: demo_vectorize.py

Example 12: tfidf

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
 def tfidf(self, tokenizer):
     """
     Return the TF-IDF matrix RDD computed with Spark's TF-IDF utilities.
     """
     self._create_rdd(tokenizer)
     hashingTF = HashingTF()
     tf = hashingTF.transform(self.token_rdd)
     idf = IDF(minDocFreq=2).fit(tf)
     tfidf = idf.transform(tf)
     return self.rdd, idf, tfidf
Developer: xiaoyubai, Project: wiki-search, Lines of code: 12, Source: model.py

Example 13: tf_idf

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def tf_idf(sc,title_token):
    hashingTF = HashingTF(100)
    title_token = sc.parallelize(title_token)
    tf = hashingTF.transform(title_token)
    print(tf, ' tf')
   
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
   
    return tfidf
Developer: IcedNecro, Project: AWO-61-backend, Lines of code: 12, Source: service_func.py

Example 14: mySpark

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
def mySpark(minFreq, keyWord):

    # text cleaning function
    def removePunctuation(text):
        res=text.lower().strip()
        res=re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Function for printing each element in RDD
    def println(x):
        for i in x:
            print(i)

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf = conf)

    # Load documents content (one per line) + cleaning.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))

    # Get documents content without word mapping
    documentNames = fields.map(lambda x: x[3])

    # TF processing
    hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)

    # IDF & TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)

    # Get keyword relevance with content and zip it
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)

    # print result
    print("Best document for keywords is:")
    print(zippedResults.max())
Developer: arsoedjono, Project: big-data, Lines of code: 45, Source: tfidf_mod.py

Example 15: test_binary_term_freqs

# Required module: from pyspark.mllib.feature import HashingTF [as alias]
# Or: from pyspark.mllib.feature.HashingTF import transform [as alias]
 def test_binary_term_freqs(self):
     hashingTF = HashingTF(100).setBinary(True)
     doc = "a a b c c c".split(" ")
     n = hashingTF.numFeatures
     output = hashingTF.transform(doc).toArray()
     expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                   hashingTF.indexOf("b"): 1.0,
                                   hashingTF.indexOf("c"): 1.0}).toArray()
     for i in range(0, n):
         self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                                ": expected " + str(expected[i]) + ", got " + str(output[i]))
Developer: Brett-A, Project: spark, Lines of code: 13, Source: test_feature.py


Note: The pyspark.mllib.feature.HashingTF.transform examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers, and copyright in the source code remains with the original authors. Please refer to each project's license before redistributing or reusing the code; do not reproduce this article without permission.