本文整理匯總了 Python 中 pyspark.ml.feature.HashingTF.transform 方法的典型用法代碼示例。如果您正苦於以下問題:Python HashingTF.transform 方法的具體用法?Python HashingTF.transform 怎麼用?Python HashingTF.transform 使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 pyspark.ml.feature.HashingTF 的用法示例。
在下文中一共展示了 HashingTF.transform 方法的 15 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: textPredict
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def textPredict(request):
    """Train a title classifier on stored news data and score one submitted title.

    Step 6 of the application: text clustering / popularity prediction.

    The function reads ``label`` and ``title`` from ``request.POST``, builds
    TF-IDF features for the titles stored in the parquet data set, fits a
    StringIndexer -> VectorIndexer -> DecisionTreeClassifier pipeline on a
    70/30 random split, scores the held-out part, and finally scores the
    single submitted title with the same fitted IDF model and pipeline.

    The Spark context created here is always stopped, even when one of the
    Spark steps raises, so a failed request does not leave a context behind.

    :param request: object exposing a ``POST`` mapping with the keys
        ``label`` and ``title``.
    :return: the value produced by ``render`` for the context
        ``{'resultList': ...}``.
    """
    label = request.POST['label']
    title = request.POST['title']
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    try:
        sqlContext = SQLContext(sc)

        # Process the data set and generate the feature vectors.
        dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
        print(dfTitles.dtypes)
        tokenizer = Tokenizer(inputCol="title", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
        featurizedData = hashingTF.transform(tokenizer.transform(dfTitles))
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        rescaledData.show()
        for features_label in rescaledData.select("features", "rawFeatures").take(3):
            print(features_label)

        # Train the decision-tree model.
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
        featureIndexer = VectorIndexer(
            inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
        (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        model = pipeline.fit(trainingData)

        # Test the model on the held-out split.
        predictions = model.transform(testData)
        predictions.show()
        predictions.select("prediction", "indexedLabel", "features").show(5)

        # Score the single news item supplied by the user. The tokenizer and
        # hashing transformer built above are reused so the submitted title
        # is featurized with exactly the same settings as the training data.
        sentenceData = sqlContext.createDataFrame([
            (label, title),
        ], ['label', "title"])
        userFeatures = hashingTF.transform(tokenizer.transform(sentenceData))
        myprediction = model.transform(idfModel.transform(userFeatures))
        print("==================================================")
        myprediction.show()
        resultList = convertDfToList(myprediction)

        # Evaluate the model.
        # NOTE(review): the "precision" metric name appears to be accepted only
        # by older Spark releases; newer ones expect "accuracy" - confirm
        # against the deployed Spark version.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g " % (1.0 - accuracy))
        treeModel = model.stages[2]
        print(treeModel)
    finally:
        # Release the Spark context whether or not the steps above succeeded.
        sc.stop()
    # NOTE(review): no template name is passed to ``render``; if this is
    # django.shortcuts.render the second positional argument is expected to be
    # the template name - confirm and supply it.
    return render(request, {'resultList': resultList})
示例2: tf_idf_feature
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def tf_idf_feature(wordsData):
    """Compute TF-IDF features for ``wordsData`` and print three sample rows.

    The ``filtered`` column is hashed into 20 buckets (``rawFeatures``), an
    IDF model is fitted on the hashed frame and applied to it (``features``),
    and the first three ``(features, id)`` rows are printed. Nothing is
    returned.

    :param wordsData: DataFrame providing the ``filtered`` and ``id`` columns.
    """
    hashed = HashingTF(
        inputCol="filtered", outputCol="rawFeatures", numFeatures=20
    ).transform(wordsData)
    weighted = IDF(inputCol="rawFeatures", outputCol="features").fit(hashed).transform(hashed)
    sample_rows = weighted.select("features", "id").take(3)
    for sample in sample_rows:
        print(sample)
示例3: term_frequency
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def term_frequency(df, column):
    """
    Replace a token column by its hashed term-frequency vector.

    Transformation: array<string> --> vector

    The vector is first written to a temporary column named ``'_' + column``
    and then handed to the ``replace`` helper together with the original
    column name.
    """
    scratch_column = '_' + column
    hashed = HashingTF(inputCol=column, outputCol=scratch_column).transform(df)
    return replace(hashed, column, scratch_column)
示例4: extract_tf_features
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def extract_tf_features(p_df, input_col, output_col, num_features=3000):
    """
    Extracts TF features by hashing the tokens of one column.

    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :param num_features: Value passed to HashingTF as ``numFeatures``.
        Defaults to 3000, the value that used to be hard-coded here.
    :return: The DataFrame returned by ``HashingTF.transform(p_df)``.
    """
    hashingTF = HashingTF(inputCol=input_col, outputCol=output_col, numFeatures=num_features)
    return hashingTF.transform(p_df)
示例5: tfidf
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):
    """Apply HashingTF followed by IDF and publish the fitted IDF model.

    :param dataframe: input DataFrame.
    :param in_col1: input column for HashingTF.
    :param out_col1: output column for HashingTF.
    :param in_col2: input column for IDF.
    :param out_col2: output column for IDF.
    :param n: value passed to HashingTF as ``numFeatures``.
    :return: the frame produced by the fitted IDF model.

    Side effect: the fitted model is stored in the module-level name
    ``idfModel``.
    """
    global idfModel
    term_counts = HashingTF(
        inputCol=in_col1, outputCol=out_col1, numFeatures=n
    ).transform(dataframe)
    idfModel = IDF(inputCol=in_col2, outputCol=out_col2).fit(term_counts)
    return idfModel.transform(term_counts)
示例6: run_tf_idf_spark_ml
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    """Tokenize the ``body`` column and return the frame with TF-IDF features.

    Stages: Tokenizer (``body`` -> ``words``), HashingTF (``words`` ->
    ``rawFeatures``), IDF fitted on the hashed frame (``rawFeatures`` ->
    ``features``).

    :param df: DataFrame providing a ``body`` column.
    :param numFeatures: value passed to HashingTF as ``numFeatures``.
    :return: the frame produced by the fitted IDF model.
    """
    tokens = Tokenizer(inputCol="body", outputCol="words").transform(df)
    term_counts = HashingTF(
        inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures
    ).transform(tokens)
    fitted_idf = IDF(inputCol="rawFeatures", outputCol="features").fit(term_counts)
    return fitted_idf.transform(term_counts)
示例7: predictLabel
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def predictLabel(label,title,model):
    """Predict the label of a single news item.

    Builds a one-row frame from ``(label, title)``, tokenizes the title,
    hashes the tokens into 20 buckets, rescales them with the module-level
    ``idfModel`` and feeds the result to ``model``. The frame is created
    through the module-level ``sqlContext``.

    :return: the frame returned by ``model.transform``.
    """
    single_row = sqlContext.createDataFrame([(label, title)], ['label', "title"])
    tokens = Tokenizer(inputCol="title", outputCol="words").transform(single_row)
    term_counts = HashingTF(
        inputCol="words", outputCol="rawFeatures", numFeatures=20
    ).transform(tokens)
    weighted = idfModel.transform(term_counts)
    return model.transform(weighted)
示例8: test_apply_binary_term_freqs
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def test_apply_binary_term_freqs(self):
    """HashingTF with the binary flag set yields the expected 10-slot vector.

    A single row with the tokens a, a, b, c, c, c is hashed into 10 buckets
    and every slot is compared against the expected dense vector (1.0 in
    the first three slots, 0.0 elsewhere) to 14 decimal places.
    """
    size = 10
    frame = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
    transformer = HashingTF()
    transformer.setInputCol("words").setOutputCol("features").setNumFeatures(size).setBinary(True)
    hashed = transformer.transform(frame)
    actual = hashed.select("features").first().features.toArray()
    wanted = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
    for slot in range(size):
        failure_text = ("Error at " + str(slot) + ": expected " + str(wanted[slot]) +
                        ", got " + str(actual[slot]))
        self.assertAlmostEqual(actual[slot], wanted[slot], 14, failure_text)
示例9: tf_feature_vectorizer
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def tf_feature_vectorizer(df,no_of_features,ip_col):
    """Add hashed TF and TF-IDF columns derived from ``ip_col``.

    The term-frequency column is named ``ip_col + "raw_features"`` and the
    TF-IDF column ``ip_col + "features"``. Before returning, the first five
    rows of the result are shown and its row count is printed.

    :param df: input DataFrame.
    :param no_of_features: value passed to HashingTF as ``numFeatures``.
    :param ip_col: name of the token column to vectorize.
    :return: the frame produced by the fitted IDF model.
    """
    raw_name = ip_col+"raw_features"
    scaled_name = ip_col+"features"
    term_counts = HashingTF(
        inputCol=ip_col, outputCol=raw_name, numFeatures=no_of_features
    ).transform(df)
    fitted_idf = IDF(inputCol=raw_name, outputCol=scaled_name).fit(term_counts)
    result = fitted_idf.transform(term_counts)
    result.show(5)
    print(result.count())
    return result
示例10: create_features
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def create_features(raw_data):
    """Turn an RDD of 3-element records into a frame with TF-IDF features.

    Each record is mapped to a Row with the fields ``appid``, ``price`` and
    ``sentence`` (positions 0, 1 and 2). The sentence is tokenized, hashed
    into 5 buckets and rescaled by an IDF model fitted on the same data.
    The frame is created through the module-level ``sqlContext``.

    :param raw_data: RDD whose elements are indexable at positions 0..2.
    :return: the frame produced by the fitted IDF model.
    """
    def to_row(record):
        # Name the three positional fields of one record.
        return Row(appid=record[0], price=record[1], sentence=record[2])

    records_df = sqlContext.createDataFrame(raw_data.map(to_row))
    # Sentence -> words
    tokens_df = Tokenizer(inputCol='sentence', outputCol='words').transform(records_df)
    # Words -> hashed term frequencies
    counts_df = HashingTF(
        inputCol='words', outputCol='rawFeatures', numFeatures=5
    ).transform(tokens_df)
    # Term frequencies -> TF-IDF
    fitted_idf = IDF(inputCol='rawFeatures', outputCol='features').fit(counts_df)
    return fitted_idf.transform(counts_df)
示例11: makeTFIDF
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def makeTFIDF(sc, spark, reviews):
    """Add hashed term-frequency and TF-IDF columns to ``reviews``.

    The ``words_clean`` column is hashed into 1000 buckets (column ``tf``);
    HashingTF is used here to keep the dimensionality low. An IDF model is
    then fitted on the result and applied to it (column ``tfidf``).

    The enriched frame is returned to the caller; previously it was built
    and then dropped when the function ended.

    :param sc: not used by this function; kept for interface compatibility.
    :param spark: not used by this function; kept for interface compatibility.
    :param reviews: DataFrame providing a ``words_clean`` column.
    :return: ``reviews`` with the ``tf`` and ``tfidf`` columns added.
    """
    # Hashed term frequencies.
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)
    # TF-IDF weighting fitted on the same data.
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
    return reviews
示例12: append_tf_idf
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def append_tf_idf(self, df):
    """
    Append term-frequency and inverse-document-frequency columns to ``df``.

    The ``tokens`` column is hashed into 100000 buckets (``rawFeatures``);
    that intermediate frame is persisted with MEMORY_AND_DISK before an IDF
    model is fitted on it and applied to it (``features``).

    Args:
        :param df: Dataframe providing a ``tokens`` column.
    Returns:
        :return: Dataframe with the columns 'rawFeatures' and 'features' added.
    """
    # Term-frequency column.
    term_counts = HashingTF(
        inputCol="tokens", outputCol="rawFeatures", numFeatures=100000
    ).transform(df)
    term_counts.persist(StorageLevel.MEMORY_AND_DISK)
    # Inverse-document-frequency column.
    fitted_idf = IDF(inputCol="rawFeatures", outputCol="features").fit(term_counts)
    return fitted_idf.transform(term_counts)
示例13: BeautifulSoup
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
review_text = BeautifulSoup(raw_review).text
#
# 2. Remove non-letters
letters_only = re.sub("[^a-zA-Z]", " ", review_text)
#
# 3. Convert to lower case, split into individual words
words = letters_only.lower().split()
#
# 4. Remove stop words
meaningful_words = [w for w in words if not w in stops]
#
# 5. Join the words back into one string separated by space,
# and return the result.
return " ".join( meaningful_words)
# Script section: load the labelled reviews, clean them and build TF-IDF
# features for every review.
stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
# Drop the first line of the file (index 0). The pair is indexed instead of
# being unpacked in the lambda signature, because tuple parameters are a
# syntax error on Python 3.
rows = lines.zipWithIndex().filter(lambda pair: pair[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
# Tab-separated fields: position 0 -> id, 1 -> label (as float), 2 -> raw
# review text, which is cleaned by review_to_words.
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]),
                                 review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
# Keep only the columns selected for the next stage.
selectData = rescaledData.select("label","features")
示例14: set
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# Script section: count in how many rows each stemmed token occurs, then look
# up the HashingTF bucket index of every distinct token.
df = spark.read.load('/home/manh/Documents/data/result_pre.parquet').select('id', 'stemmed')
rdd = df.select('stemmed').rdd
# One (token, 1) pair per distinct token of a row, summed per token.
pre_idf = rdd.flatMap(
    lambda row: [(token, 1) for token in set(row[0])]
).reduceByKey(lambda left, right: left + right)
pre_idf_collect = pre_idf.collect()
# Wrap every token in a one-element list so HashingTF can consume it.
rdd_words = pre_idf.map(lambda pair: Row(word=[pair[0]]))
df_words = spark.createDataFrame(rdd_words)
hashingTF = HashingTF(inputCol="word", outputCol="rawFeatures", numFeatures=100000)
featurizedData = hashingTF.transform(df_words)
# "<token> <bucket index>" strings, one per token.
featurizedData.rdd.map(
    lambda r: '%s %s' % (r.word[0], r['rawFeatures'].indices[0])
).collect()
示例15: main
# 需要導入模塊: from pyspark.ml.feature import HashingTF [as 別名]
# 或者: from pyspark.ml.feature.HashingTF import transform [as 別名]
def main(sc, sqlContext):
start = timer()
stpwrds = stopwords.words('english')
tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode) if unicodedata.category(unichr(i)).startswith('S') or unicodedata.category(unichr(i)).startswith('P') or unicodedata.category(unichr(i)).startswith('N'))
print '---Pegando produtos---'
start_i = timer()
productRDD = sc.parallelize(findProductsByCategory([]))
print '####levou %d segundos' % (timer()-start_i)
print '---Criando corpus---'
start_i = timer()
corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3] ))
.map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
.cache())
print '####levou %d segundos' % (timer()-start_i)
print '---Pegando e persistindo dados de categoria e tokens---'
start_i = timer()
tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
numTokens = len(tokens)
category = productRDD.map(lambda x: x[2]).distinct().collect()
categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
insertTokensAndCategories(tokens, category, categoryAndSubcategory)
print '####levou %d segundos' % (timer()-start_i)
print '---Calculando TF-IDF dos produtos---'
start_i = timer()
wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
#persistir isso para que ele nao tenha que fazer de novo na predicaoo
wordsDataDF = sqlContext.createDataFrame(wordsData)
#persistindo para a predicao
wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
#persistir isso para que ele nao tenha que fazer de novo na predicaoo
wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)
if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")
wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
idf = IDF(inputCol="rawFeatures", outputCol="features")
featurizedData = hashingTF.transform(wordsDataDF)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
#VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))
VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
print '####levou %d segundos' % (timer()-start_i)
print '--Criando modelo Naive Bayes---'
start_i = timer()
model = NaiveBayes.train(VSMTrain)
if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")
model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
print '####levou %d segundos' % (timer()-start_i)
print '---Testando modelo Naive Bayes---'
start_i = timer()
prediction = VSMTest.map(lambda p : (categoryAndSubcategory[int(model.predict(p.features))], categoryAndSubcategory[int(p.label)]))
acuraccy = float(prediction.filter(lambda (x, v): x[0]==v[0]).count())/float(prediction.count())
print 'acuracidade de %f' % acuraccy
print '####levou %d segundos' % (timer()-start_i)
print '---Pegando os posts---'
start_i = timer()
posts = list()
wb = load_workbook(filename = '/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
sheet = wb['Menes']
for row in sheet.iter_rows(row_offset=1):
post = list()
for cell in row:
if cell.value is None:
break
post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral' else 0 if cell.value == 'Negative' else removeAccents(cell.value))
if len(post) > 0:
posts.append(tuple(post))
print '####levou %d segundos' % (timer()-start_i)
print '---Criando corpus---'
start_i = timer()
postsRDD = sc.parallelize(posts)
postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
.map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
.cache())
#.........這裏部分代碼省略.........