

Python HashingTF.transform Method Code Examples

This article collects typical usage examples of the Python method pyspark.ml.feature.HashingTF.transform. If you are wondering what HashingTF.transform does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of the enclosing class, pyspark.ml.feature.HashingTF.


Fifteen code examples of HashingTF.transform are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
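Before the project-level examples, here is a minimal, self-contained sketch of the TF-IDF pattern most of them share. It assumes a local SparkSession named spark, which is not part of the original examples:

from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Assumed setup: a local SparkSession (not part of the original snippets).
spark = SparkSession.builder.master("local[*]").appName("hashingtf-demo").getOrCreate()

# Two toy documents.
df = spark.createDataFrame([(0, "spark is fast"), (1, "hashing maps words to buckets")],
                           ["id", "text"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
words = tokenizer.transform(df)

# HashingTF.transform maps each word array to a sparse term-frequency vector
# of fixed size numFeatures via the hashing trick (no vocabulary is stored).
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=32)
tf = hashingTF.transform(words)

# IDF downweights terms that appear in many documents.
idfModel = IDF(inputCol="rawFeatures", outputCol="features").fit(tf)
idfModel.transform(tf).select("id", "features").show(truncate=False)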

Example 1: textPredict

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    # NB: Django's render() normally takes a template name between the request
    # and the context dict; the template argument is missing in the original.
    return render(request,{'resultList':resultList})
Author: JallyHe, Project: networkPublicOpinionAnalysisSystem, Lines: 60, Source: views.py
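A design note on Example 1: the single-news prediction path reuses the idfModel fitted on the full news corpus rather than refitting IDF on the one-row DataFrame. IDF statistics are only meaningful when computed over the training corpus, so any new document must be transformed with the already-fitted model.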

Example 2: tf_idf_feature

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
Author: wingsrc, Project: benchmark_minhash_lsh, Lines: 10, Source: preprocessing.py

Example 3: term_frequency

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def term_frequency(df, column):
    """
    Compute term-frequency of a token contained in a column.
    Transformation: array<string> --> vector
    """ 
    tf = HashingTF(inputCol=column, outputCol='_'+column)
    df = tf.transform(df)
    
    df = replace(df, column, '_'+column)
    return df
Author: ribonj, Project: lsir, Lines: 12, Source: ml.py
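The replace helper called in Example 3 is defined elsewhere in the project's ml.py and is not shown on this page. A plausible minimal sketch, assuming it simply swaps the temporary hashed column in for the original one (the implementation below is inferred, not taken from the source):

def replace(df, column, new_column):
    # Hypothetical reconstruction: drop the original column and rename the
    # temporary hashed column ('_' + column) back to the original name.
    return df.drop(column).withColumnRenamed(new_column, column)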

Example 4: extract_tf_features

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def extract_tf_features(p_df, input_col, output_col):
    """
    Extracts TF features.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.    
    """
    hashingTF = HashingTF(inputCol=input_col, outputCol=output_col, numFeatures=3000)
    return hashingTF.transform(p_df)
Author: rhasan, Project: machine-learning, Lines: 12, Source: Quora.py

Example 5: tfidf

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):

    # keep the fitted IDF model at module level so other functions can reuse it
    global idfModel
    
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    
    return dataframe
Author: rjshanahan, Project: Text_Analytics_Topic_Modelling, Lines: 13, Source: topic_modelling_scikit.py

Example 6: run_tf_idf_spark_ml

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
Author: ctavan, Project: bbuzz2016, Lines: 13, Source: bbuzz2016-backup.py
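A note on the bucket count in Example 6: numFeatures=1 << 20 (1,048,576) matches the default of the older RDD-based pyspark.mllib.feature.HashingTF, while the DataFrame-based pyspark.ml.feature.HashingTF defaults to 2^18 = 262,144. Larger values reduce hash collisions at the cost of wider (though still sparse) vectors.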

Example 7: predictLabel

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def predictLabel(label,title,model):
    """预测新闻的标签"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
Author: JallyHe, Project: networkPublicOpinionAnalysisSystem, Lines: 14, Source: desionTree.py

Example 8: test_apply_binary_term_freqs

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
    def test_apply_binary_term_freqs(self):

        df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
        n = 10
        hashingTF = HashingTF()
        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
        output = hashingTF.transform(df)
        features = output.select("features").first().features.toArray()
        expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
        for i in range(0, n):
            self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                                   ": expected " + str(expected[i]) + ", got " + str(features[i]))
Author: Brett-A, Project: spark, Lines: 14, Source: test_feature.py
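For contrast with the binary test above, a short sketch of what the default mode would produce, reusing the df and hashingTF names from the test (the bucket positions depend on the hash function; only the values are predictable):

# With the default setBinary(False), the same document yields raw term counts
# in the hashed buckets instead of 0/1 indicators: 2.0 for "a", 1.0 for "b"
# and 3.0 for "c", each at that token's hash bucket.
hashingTF.setBinary(False)
counts = hashingTF.transform(df).select("features").first().features.toArray()
assert sorted(v for v in counts if v > 0) == [1.0, 2.0, 3.0]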

Example 9: tf_feature_vectorizer

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def tf_feature_vectorizer(df,no_of_features,ip_col):
    #from pyspark.sql.functions import udf
    #from pyspark.sql.types import *
    output_raw_col = ip_col+"raw_features"
    output_col = ip_col+"features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
Author: vikaasa, Project: Spark_Workshop, Lines: 15, Source: sparking_your_interest.py

Example 10: create_features

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def create_features(raw_data):
    #Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2])))
    #Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    #Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    #Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
Author: DataLAUSDEclassProject, Project: spark, Lines: 15, Source: spark_cluster.py

Example 11: makeTFIDF

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
    # Return the transformed DataFrame; without this the result is discarded.
    return reviews
Author: sam46, Project: Yelper, Lines: 16, Source: project.py
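The commented-out CountVectorizer block in Example 11 highlights the trade-off: CountVectorizer learns an explicit vocabulary, which lets you map feature indices back to words but costs an extra pass over the data and memory proportional to the vocabulary, while HashingTF is stateless with a fixed output size (here 1000 buckets) at the price of irreversible hashing and possible collisions.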

Example 12: append_tf_idf

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
 def append_tf_idf(self, df):
     """
     Calculate term frequency and inverse document frequency
      based on at least 1 visit hourly in this case. Compares how often the tokens appeared
      at least once per hour compared to other tokens. Not used for the main purpose of the project.
     Args:
         :param df: Dataframe parameter.
     Returns:
         :return:  Dataframe with term frequency and inverse document frequency added in the columns
                     'rawFeatures' and 'features' respectively.
     """
     #Create TF column.
     hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
     tf = hashingTF.transform(df)
     tf.persist(StorageLevel.MEMORY_AND_DISK)
     #Create IDF column.
     idf = IDF(inputCol="rawFeatures", outputCol="features")
     idfModel = idf.fit(tf)
     tfidf = idfModel.transform(tf)
     return tfidf
Author: ari99, Project: wiki_stats, Lines: 22, Source: operations.py

Example 13: BeautifulSoup

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
# Imports needed by this snippet (not shown in the truncated original);
# sc and sqlContext are assumed to come from the surrounding Spark application.
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# The opening of this helper was cut off at the source; the signature below is
# reconstructed from its call site further down (review_to_words(p[2])).
def review_to_words(raw_review):
    #
    # 1. Remove HTML markup
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                            
    # 
    # 4. Remove stop words
    meaningful_words =  [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join( meaningful_words)   

stops = set(stopwords.words("english")) 
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")
Author: rbkasat, Project: CSYE7374_FinalProject, Lines: 33, Source: RandomForest_TF-IDF.py

Example 14: set

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer


df = spark.read.load('/home/manh/Documents/data/result_pre.parquet')
df = df.select('id', 'stemmed')
rdd = df.select('stemmed').rdd
pre_idf = rdd.map(lambda x: set(x[0])).flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
pre_idf_collect = pre_idf.collect()

rdd_words = pre_idf.map(lambda x: Row(word=[x[0]]))

df_words = spark.createDataFrame(rdd_words)

hashingTF = HashingTF(inputCol="word", outputCol="rawFeatures", numFeatures=100000)

featurizedData = hashingTF.transform(df_words)

featurizedData.rdd.map(lambda x: (x.word[0], x['rawFeatures'].indices[0])).map(lambda x: '%s  %s' % (x)).collect()
Author: manhcompany, Project: manhdoi, Lines: 21, Source: idf_f.py
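A caveat on Example 14 (inferred from the code, not stated by its author): each row holds exactly one word, so rawFeatures has a single nonzero entry and indices[0] is that word's hash bucket. With numFeatures=100000, distinct words can still collide into the same bucket, so this word-to-index mapping is not guaranteed to be injective.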

Example 15: main

# Required import: from pyspark.ml.feature import HashingTF [as alias]
# Or: from pyspark.ml.feature.HashingTF import transform [as alias]
def main(sc, sqlContext):
    start = timer()

    stpwrds = stopwords.words('english')
    tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode) if unicodedata.category(unichr(i)).startswith('S') or unicodedata.category(unichr(i)).startswith('P') or unicodedata.category(unichr(i)).startswith('N'))

    print '---Fetching products---'
    start_i = timer()
    productRDD = sc.parallelize(findProductsByCategory([]))
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3] ))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
                           .cache())
    print '####took %d seconds' % (timer()-start_i)

    print '---Fetching and persisting category and token data---'
    start_i = timer()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    numTokens = len(tokens)
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)
    print '####took %d seconds' % (timer()-start_i)

    print '---Computing product TF-IDF---'
    start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
    # persist this so it does not have to be recomputed at prediction time
    wordsDataDF = sqlContext.createDataFrame(wordsData)   

    # persisting for prediction
    wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
    # persist this so it does not have to be recomputed at prediction time
    wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)   

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet") 

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    #VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
    VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))

    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer()-start_i)


    print '---Building the Naive Bayes model---'
    start_i = timer()
    model = NaiveBayes.train(VSMTrain)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")

    model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    print '####took %d seconds' % (timer()-start_i)

    print '---Testing the Naive Bayes model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p : (categoryAndSubcategory[int(model.predict(p.features))], categoryAndSubcategory[int(p.label)]))
    accuracy = float(prediction.filter(lambda (x, v): x[0]==v[0]).count())/float(prediction.count())
    print 'accuracy of %f' % accuracy
    print '####took %d seconds' % (timer()-start_i)
    
    print '---Fetching the posts---'

    start_i = timer()
    posts = list()
    wb = load_workbook(filename = '/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
    sheet = wb['Menes']
    for row in sheet.iter_rows(row_offset=1):
        post = list()
        for cell in row:
            if cell.value is None:
                break
            post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral' else 0 if cell.value == 'Negative' else removeAccents(cell.value))

        if len(post) > 0:            
            posts.append(tuple(post))

    print '####took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    postsRDD = sc.parallelize(posts)
    postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
                           .cache())
#......... remaining code omitted .........
Author: felipecontra3, Project: recsys-tcc-ml, Lines: 103, Source: train_classifier.py
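One thing to watch in Example 15: the model is trained on labels from category.index(...) (the commented-out line used categoryAndSubcategory), yet the test step maps both predictions and labels back through categoryAndSubcategory, whose ordering differs from category. The x[0]==v[0] comparison masks the mismatch, so anyone reusing this snippet should index predictions into the same list the training labels came from.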


Note: The pyspark.ml.feature.HashingTF.transform examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by various developers; copyright of the source code remains with the original authors, and distribution and use are subject to each project's license. Do not republish without permission.