This article collects typical usage examples of the Python method pyspark.ml.feature.Tokenizer.transform. If you have been wondering what Tokenizer.transform does, how to call it, or what working examples look like, the hand-picked code examples below should help. You can also read more about the class the method belongs to, pyspark.ml.feature.Tokenizer.
The following presents 15 code examples of Tokenizer.transform, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
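Before diving into the examples, here is a minimal, self-contained sketch of the basic pattern they all share. The DataFrame contents and column names below are illustrative assumptions, not taken from any of the examples that follow.

# Minimal sketch (illustrative data): Tokenizer.transform adds a new array-of-strings
# column by lower-casing the input column and splitting it on whitespace.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer

spark = SparkSession.builder.appName("TokenizerQuickStart").master("local[*]").getOrCreate()
df = spark.createDataFrame([(0, "Spark ML makes tokenization easy")], ["id", "text"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized = tokenizer.transform(df)  # original columns are kept, "words" is appended
tokenized.show(truncate=False)
spark.stop()

The same transform call appears in every example below, usually followed by HashingTF and IDF to turn the token column into TF-IDF features.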
Example 1: textPredict
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def textPredict(request):
    """6. Text clustering and popularity prediction"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    """Process the dataset and build feature vectors"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    """Train a decision tree model"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    """Test the model"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    """Predict on user input: a single news item"""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """Evaluate the model"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    treeModel = model.stages[2]
    print(treeModel)
    sc.stop()
    # NOTE: django.shortcuts.render normally also takes a template name as its second argument.
    return render(request, {'resultList': resultList})
Example 2: main
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def main():
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType([StructField("label", IntegerType(), True),
                         StructField("title", StringType(), True),
                         StructField("abstract", StringType(), True)])

    # Download the data from S3 into two separate Dataframes
    traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                     'train.csv')), header=False, schema=schema, encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                          'test.csv')), header=False, schema=schema, encoding='UTF-8')

    # Tokenize the abstract column which contains the input text
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train'))

    # Similar data processing for validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
Example 3: token
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def token(dataframe, in_col, out_col):
    tokenizer = Tokenizer(inputCol=in_col, outputCol=out_col)
    dataframe = tokenizer.transform(dataframe)
    dataframe.printSchema()
    return dataframe
Example 4: run_tf_idf_spark_ml
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
Example 5: predictLabel
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def predictLabel(label, title, model):
    """Predict the label of a news item"""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
Example 6: create_features
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def create_features(raw_data):
    # Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r: Row(appid=r[0], price=r[1], sentence=r[2])))
    # Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    # Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    # Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
Example 7: preprocessing_titles
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def preprocessing_titles(path, name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    # after stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData = remover.transform(wordsData)
    df = wordsData.map(lambda x: x['id']).zipWithUniqueId().toDF(["id", "index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    qr = sqlContext.sql("SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id")
    if name != '':
        exportOnS3(qr, "s3a://redit-preprocessed/", name)
    qr = qr.map(lambda Row: (Row['index'], Row['id'], Row['filtered']))
Example 8: get_top_words
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
Example 9: run_tf_idf_spark_mllib
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)

    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
Example 10: Tokenizer
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("tokenizer_sample") \
    .master("local[*]") \
    .getOrCreate()
data = [(0, "Tokenization is the process"), (1, "Refer to the Tokenizer")]
inputDF = spark.createDataFrame(data).toDF("id", "input")
tokenizer = Tokenizer(inputCol="input", outputCol="output")
outputDF = tokenizer.transform(inputDF)
outputDF.printSchema()
outputDF.show()
spark.stop()
Example 11: StringIndexer
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
#LOADING DATA FROM HDFS TO SPARK DATAFRAME
df0=spark.read.option("sep", "\t").option('header',True).csv('hdfs://192.168.50.93:9000/user/hadoop/books2/amazon_reviews_us_Wireless_v1_00.tsv')
df0.printSchema()
#FILTERING FOR EMPTY VALUES
df01 = df0.filter((col("review_body").isNotNull()) & (col("verified_purchase").isNotNull()))
#ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)
#CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
#NAIVEBAYES
nb = NaiveBayes(featuresCol="features", labelCol="class_res")
#Model training
model = nb.fit(rescaledData)
#Model Saving
model.write().overwrite().save("./NB_model")
Example 12: BeautifulSoup
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
def review_to_words(raw_review):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    #
    # 5. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)

stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row, index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]),
                                 review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")
Example 13: SparkContext
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
from __future__ import print_function
from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="TokenizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    sentenceDataFrame = sqlContext.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsDataFrame = tokenizer.transform(sentenceDataFrame)
    for words_label in wordsDataFrame.select("words", "label").take(3):
        print(words_label)
    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)
    # $example off$

    sc.stop()
Example 14: Row
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
parts = lines.map(lambda l: l.split(","))
f = parts.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2], label= int(float(p[3])),training=1))
linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2],label= int(float(p[3])),training=0))
alldata = f.union(ft)
schemaApp = sqlContext.createDataFrame(alldata)
schemaApp.registerTempTable("data")
tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)
hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
wordsvectors = rescaledData["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
labelsAndPreds = wordsvectors.map(lambda p: (p.label, model.predict(p.features)))
Example 15: time
# Required import: from pyspark.ml.feature import Tokenizer [as alias]
# Or: from pyspark.ml.feature.Tokenizer import transform [as alias]
print "Create dataframe"
t0 = time()
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
print "Showing first example : "
print
print df.first()
tt = time() - t0
print
print "Dataframe created in {} second".format(round(tt,3))
# In[314]:
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)
# In[315]:
from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)
# In[317]:
print "Start tokenizing, computing bigrams and splitting between test and train"
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8,0.2])
dfTrain.take(1)