This article collects typical usage examples of the Python method pyspark.SparkContext.wholeTextFiles. If you have been wondering what SparkContext.wholeTextFiles does, how to call it, or what real-world uses look like, the curated code samples below should help. You can also read more about the enclosing class, pyspark.SparkContext.
Shown below are 15 code examples of SparkContext.wholeTextFiles, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
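Before the examples, a quick orientation: wholeTextFiles reads every file under a path and returns an RDD of (path, content) pairs, one record per whole file, in contrast to textFile, which yields one record per line. A minimal sketch (the input directory is a placeholder):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("wholeTextFilesDemo").setMaster("local[*]")
sc = SparkContext(conf=conf)

# Each record is one whole file: a (path, content) pair of strings.
pairs = sc.wholeTextFiles("/tmp/demo_input")  # placeholder directory

# For example, count the lines of every file; no file is ever split
# across records the way textFile would split it.
line_counts = pairs.mapValues(lambda content: len(content.splitlines()))
for path, n in line_counts.collect():
    print("%s: %d lines" % (path, n))

sc.stop()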
Example 1: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def main():
    parser = argparse.ArgumentParser('description')
    parser.add_argument('--input', '-i', help='Input file or folder')
    parser.add_argument('--output', '-o', help='Output file')
    parser.add_argument('--lemmatize', '-l', action='store_true', help='Use lemmatizations')
    parser.add_argument('--pos', '-p', action='store_true', help='Use POS tags')
    parser.add_argument('--mincount', '-m', type=int, help='Filter words with <= this count (makes faster)')
    args = parser.parse_args()
    conf = SparkConf().set("spark.executor.memory", "5G").set("spark.storage.memoryFraction", 0.1)
    sc = SparkContext("local", "wordcount", conf=conf)
    # read the whole files into (filename, contents) pairs
    files = sc.wholeTextFiles(args.input)
    # extract all the tokens
    extractor = partial(process_pair, lemmatize=args.lemmatize, pos=args.pos)
    wordcounts = files.flatMap(extractor).reduceByKey(add)
    if args.mincount:
        wordcounts = wordcounts.filter(lambda x: x[1] > args.mincount)
    asStrings = wordcounts.map(lambda x: x[0] + "\t" + str(x[1]))
    asStrings.saveAsTextFile(args.output)
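The process_pair helper (and the argparse, functools.partial, and operator.add imports) are not part of this excerpt. Purely as an illustration, a hypothetical stand-in that fits the flatMap/reduceByKey(add) pipeline above; the real project's tokenization, lemmatization, and POS handling will differ:

# Hypothetical stand-in for the elided process_pair helper: turn one
# (filename, contents) pair into (token, 1) pairs for reduceByKey(add).
def process_pair(pair, lemmatize=False, pos=False):
    _, contents = pair
    for token in contents.split():
        if lemmatize:
            token = token.lower()  # crude placeholder for real lemmatization
        # a real version would attach POS tags here when pos=True
        yield (token, 1)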
Example 2: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def main():
    global spark, factors, workers, iterations, betaConst, lambdaConst
    spark = SparkContext("local", "Matrix-Factorization")
    factors = int(sys.argv[1])
    workers = int(sys.argv[2])
    iterations = int(sys.argv[3])
    betaConst = float(sys.argv[4])
    lambdaConst = float(sys.argv[5])
    inputPath = sys.argv[6]
    outputPathW = sys.argv[7]
    outputPathH = sys.argv[8]
    # load data
    matrixParts = spark.wholeTextFiles(inputPath).map(lambda x: openFile(x)).collect()
    allRows = []
    allCols = []
    allValues = []
    for part in matrixParts:
        allRows += part[0]
        allCols += part[1]
        allValues += part[2]
    V = sparse.csr_matrix((allValues, (allRows, allCols)))
    W, H = matrixFactorization(V)
    writeFile(W, outputPathW)
    writeFile(H, outputPathH)
Example 3: raw_files_to_labeled_features
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def raw_files_to_labeled_features(raw_files, label_file):
    # Initialize spark
    conf = SparkConf().setAppName("SpamFilter").setMaster("local[*]")
    sc = SparkContext(conf=conf)
    # Get the set of words that we will be accepting as valid features
    valid_words = set(w.lower() for w in words.words())
    # Load training data and convert to our desired format
    raw_files = sc.wholeTextFiles(raw_files)
    # Extract a document of filtered words from each text file
    documents = raw_files.map(lambda x: (x[0], extract_words(x[1], valid_words)))
    # Calculate TF-IDF values for each document
    tfidf = calculate_tfidf(documents)
    # Load labels
    labels = sc.parallelize(load_labels(label_file)).map(lambda x: x[0])
    # Append indexes to features and labels
    indexed_labels = labels.zipWithIndex().map(lambda x: (x[1], x[0]))
    indexed_features = tfidf.zipWithIndex().map(lambda x: (x[1], x[0]))
    # Join labels and features into tuples and return
    return indexed_labels.join(indexed_features).map(lambda x: x[1]).collect()
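calculate_tfidf (like extract_words and load_labels) is elided from this excerpt. One plausible sketch using the pyspark.mllib.feature HashingTF/IDF API, the same API Example 14 below uses; it assumes the (path, [words]) layout produced above and relies on the RDD keeping its input order for the later zipWithIndex:

from pyspark.mllib.feature import HashingTF, IDF

# Hypothetical stand-in for the elided calculate_tfidf helper.
def calculate_tfidf(documents):
    # documents: RDD of (path, [words]) pairs; hash each word list
    # into a sparse term-frequency vector.
    tf = HashingTF().transform(documents.map(lambda x: x[1]))
    tf.cache()  # both IDF().fit and transform pass over this RDD
    idf = IDF().fit(tf)
    # RDD of tf-idf vectors, one per document, in input order
    return idf.transform(tf)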
Example 4: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def main(arglist):
    with open("log_file_v.txt", "a") as f:
        f.write("Start time of validation...... %s\n" % datetime.datetime.now())
    print("Start time of validation...... %s" % datetime.datetime.now())
    # mapreduce params
    output = arglist[0]
    minPartitions = int(arglist[1])
    # initialize
    sc = SparkContext(appName="PythonValidate")
    # rdd = sc.textFile(output_file_name, minPartitions=minPartitions)
    rdd = sc.wholeTextFiles(output, minPartitions=minPartitions)
    print('partitions', rdd.getNumPartitions())
    error_count = rdd.mapPartitions(separateBlocks).sum()
    sc.stop()
    print("End time of validation...... %s" % datetime.datetime.now())
    with open("log_file_v.txt", "a") as f:
        f.write("End time of validation...... %s\n" % datetime.datetime.now())
        f.write("Error count of sorted file...... %s" % error_count)
Example 5: __init__
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
class SparkBlock:
    def __init__(self):
        # self.conf = SparkConf().set("spark.executor.memory", "500M").set("spark.driver.memory", "2g")
        self.logFile = "logfile"  # Should be some file on your system
        self.sc = SparkContext(appName="SparkBlock Analysis")
        with open("/root/credentials/creds.txt") as f:
            self.ACCESS_KEY = f.readline().strip()  # .replace("/", "%2F")
            self.SECRET_KEY = f.readline().strip()  # .replace("/", "%2F")
        hconf = self.sc._jsc.hadoopConfiguration()
        hconf.set("fs.s3n.awsAccessKeyId", self.ACCESS_KEY)
        hconf.set("fs.s3n.awsSecretAccessKey", self.SECRET_KEY)
        self.chain = None
        self.logger = self.sc._jvm.org.apache.log4j.LogManager

    def no_warn(self):
        log4j_level = self.sc._jvm.org.apache.log4j.Level
        self.logger.getLogger("org").setLevel(log4j_level.ERROR)
        self.logger.getLogger("akka").setLevel(log4j_level.ERROR)

    def read_block_dir(self, uri):
        blocks = self.sc.wholeTextFiles(uri)
        blockobjs = blocks.map(lambda (n, c): (n, Block.of_string(c.strip(), 0)[0]))
        return blockobjs

    def get_files(self, uri):
        from boto.s3.connection import S3Connection
        conn = S3Connection(self.ACCESS_KEY, self.SECRET_KEY)
        bucket = conn.get_bucket(uri)
        for key in bucket.list():
            yield key

    def fetch_chain(self):
        if self.chain is None:
            self.chain = self.sc.binaryFiles(
                os.environ["HDFS_URL"] + "/media/ephemeral0/blocks/"
            ).flatMap(lambda (name, blk): Blocks.of_buffer(blk, 0, len(blk), err=name))
        return self.chain
Example 6: search
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def search(input_path, search_string):
    sc = SparkContext("local", "Inverted Index")
    sqlc = SQLContext(sc)
    # build the inverted index table and push to db
    rdd = (
        sc.wholeTextFiles(input_path)
        .flatMap(lambda (name, content): map(lambda word: (word, name), content.split()))
        .map(lambda (word, name): ((word, name), 1))
        .reduceByKey(lambda count1, count2: count1 + count2)
        .map(lambda ((word, name), count): (word, name, count))
    )
Contributor: linearregression, Project: Spark-SQL-Inverted-Index-Search-Engine, Lines: 14, Source: inverted+index+search+sql.py
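The excerpt is cut short at this point, so sqlc and search_string go unused. A hypothetical continuation, assuming the pre-2.0 SQLContext API, just to show where the code was headed; the project's actual query logic will differ:

# Hypothetical continuation: register the (word, doc, count) triples
# as a temp table and query it for each search keyword.
df = sqlc.createDataFrame(rdd, ["word", "doc", "cnt"])
df.registerTempTable("inverted_index")
for term in search_string.split():
    hits = sqlc.sql(
        "SELECT doc, cnt FROM inverted_index WHERE word = '%s'" % term)
    for doc, cnt in hits.collect():
        print("%s has %d hits for '%s'" % (doc, cnt, term))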
Example 7: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def main():
    conf = SparkConf().setAppName("Test2")
    sc = SparkContext(conf=conf)

    # new_dict converts a <tuple, value> pair into a <tuple_1, dict(tuple_2, value)> pair
    def new_dict(line):
        Dict = dict()
        Dict[line[0][1]] = line[1]
        return (line[0][0], Dict)

    # Read the raw files as <file, content> pairs
    data_raw = sc.wholeTextFiles("/home/djt/data/proclassified")

    # Doc splits the content of a <file, content> pair by line;
    # each line holds the text of one court judgment
    def Doc(line):
        s = line[1].split("\n")
        return s[0:len(s) - 1]

    # <file, content> pairs => <judgment path, judgment content> pairs
    data = data_raw.flatMap(Doc)

    # Map a judgment path to its ID
    def DocID(string):
        s = filter(lambda x: x.isdigit(), string)
        return s[1:len(s)]

    # <judgment path, judgment content> => <judgment ID, judgment content>
    data_wordsplit = data.map(lambda line: (DocID(line.split(",<")[0]), line.split(",<")[1].split(" ")))

    # Remove the spaces left by word segmentation so the regex
    # matching below sees continuous text
    def Doc_Integration(line):
        doc = ""
        for k in line[1]:
            doc += k
        return (line[0], doc)

    # <judgment ID, content (with spaces)> => <judgment ID, content>
    data_doc = data_wordsplit.map(Doc_Integration)

    # Extract the candidate dimensions from keywords_crime.txt and
    # compile each one as a regular expression
    keywords_raw = sc.textFile("/home/djt/data/keywords_crime.txt")
    keywords = keywords_raw.map(
        lambda line: re.compile(line)).collect()

    # Broadcast the <dimension, set(feature words)> pairs
    keywords = sc.broadcast(keywords)

    # Regex-match every corrupt conduct (i.e. charge) that appears
    # in each judgment
    def keywords_stats(line):
        doc = line[1]
        # doc is the judgment text; value[0] is the compiled regex
        temp = keywords.value[0].findall(doc)
        crime_set = set(temp)
        crime = ""
        for k in crime_set:
            crime += "\t" + k
        return (line[0], crime)

    # raw: <judgment ID, all charges that appear>
    raw = data_doc.map(keywords_stats)
    after = raw.sortByKey()

    # Write the output
    res = after.map(lambda (k, v): k + "\t" + v)
    res.saveAsTextFile("/home/djt/data/out")
Example 8: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def main(argv):
    sc = SparkContext(appName="KaggleDato")
    # parse labels as JSON
    PATH_TO_TRAIN_LABELS = "file:///scratch/network/alexeys/KaggleDato/train_v2.csv"
    train_label_rdd = sc.textFile(PATH_TO_TRAIN_LABELS).filter(lambda x: 'file' not in x).map(lambda x: parse_input(x)).map(lambda x: json.dumps(x)).repartition(1).saveAsTextFile('/user/alexeys/KaggleDato/train_csv_json')
    nbuckets = 1
    for bucket in range(nbuckets):
        for section in range(1, 2):
            print "Processing bucket ", bucket, " section ", section
            fIn_rdd = sc.wholeTextFiles("file:///scratch/network/alexeys/KaggleDato/"+str(bucket)+"/"+str(section)+"*_raw_html.txt", 10).map(parse_page_rdd).map(lambda x: json.dumps(x))
            fIn_rdd.repartition(1).saveAsTextFile('/user/alexeys/KaggleDato/'+str(bucket)+'_'+str(section)+'/')
Example 9: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def main(argv):
    logging.config.fileConfig(os.path.join(os.path.dirname(os.path.realpath(__file__)), "logging.ini"))
    parsed_args = parse_args(argv)
    spark_conf = SparkConf()
    sc = SparkContext(conf=spark_conf)
    with open(parsed_args.config) as in_config:
        preprocess_conf = json.load(in_config)
    if preprocess_conf.get("binary_input", True):
        files = sc.binaryFiles(preprocess_conf["input"], preprocess_conf.get('partitions', 4000))
    else:
        files = sc.wholeTextFiles(preprocess_conf["input"], preprocess_conf.get('partitions', 4000))
    files = files.repartition(preprocess_conf.get('partitions', 4000))
    metadata = parse_metadata(preprocess_conf["labeled"]["metadata"])
    labeled = sc.textFile(preprocess_conf["labeled"]["file"], preprocess_conf.get('partitions', 4000)).\
        map(lambda x: parse_labeled_line(x, metadata, True)).filter(lambda x: x.iloc[0]["label"] != 4).map(transform_labels)
    header, resampled = prep.preprocess(sc, files, labeled, label=preprocess_conf.get('label', True),
                                        cut=preprocess_conf.get("cut", {"low": 6300, "high": 6700}),
                                        pca=preprocess_conf.get("pca", None), partitions=preprocess_conf.get('partitions', 100))
    resampled.map(lambda x: x.to_csv(None, header=None).rstrip("\n")).saveAsTextFile(preprocess_conf["output"])
Example 10: main
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def main():
    ### Initialize the SparkConf and SparkContext
    ### Locations of Python files.
    sheets_loc = "/root/IdeaNets/Synapsify/Synapsify/loadCleanly/sheets.py"
    lstm_class_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/lstm_class.py"
    load_params_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/load_params.py"
    preprocess_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/synapsify_preprocess.py"
    ### Pass Python files to Spark.
    pyFiles = []
    pyFiles.append(sheets_loc)
    pyFiles.append(lstm_class_loc)
    pyFiles.append(load_params_loc)
    pyFiles.append(preprocess_loc)
    ### Automatically get the master node url from AWS; normally it is fixed.
    cmd = ["./../../spark/ec2/spark-ec2", "-r", "us-east-1", "get-master", "ruofan-cluster"]
    hostname = (
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0].split("\n")[2]
    )  ### host name of the master node.
    master_url = "spark://" + hostname + ":7077"
    # print master_url
    ### Initialize the spark configuration.
    conf = SparkConf().setAppName("ruofan").setMaster(master_url)
    sc = SparkContext(conf=conf, pyFiles=pyFiles)
    ### Add non-python files passing to Spark.
    sc.addFile("/root/spark/bin/nonbreaking_prefix.en")
    sc.addFile("/root/IdeaNets/IdeaNets/models/lstm/scode/tokenizer.perl")
    sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/stopwords.txt")
    sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/prepositions.txt")
    datafile = sc.wholeTextFiles(
        "s3n://synapsify-lstm/Synapsify_data1", use_unicode=False
    )  ### Read data directory from S3 storage.
    ### Send the application to each of the slave nodes
    datafile.foreach(lambda (path, content): lstm_test(path, content))
Example 11: search
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
def search(input_path, search_string):
    sc = SparkContext("local", "Inverted Index")
    # build the inverted index
    inverted_index_rdd = sc.wholeTextFiles(input_path)\
        .flatMap(lambda (name, content): map(lambda word: (word, name), content.split()))\
        .map(lambda (word, name): ((word, name), 1))\
        .reduceByKey(lambda count1, count2: count1 + count2)\
        .map(lambda ((word, name), count): (word, (name, count)))\
        .groupByKey()
    search_keywords = set(search_string.split())
    # query the result from the inverted index
    result_rdd = inverted_index_rdd.filter(lambda (word, name_count_list): word in search_keywords)\
        .flatMap(lambda (word, name_count_list): name_count_list)\
        .reduceByKey(lambda count1, count2: count1 + count2)\
        .sortBy((lambda (name, count): count), False)\
        .map(lambda (name, count): name + " has number of hits: " + `count`)
    # print the result
    for line in result_rdd.collect():
        print line
    sc.stop()
Contributor: linearregression, Project: Spark-SQL-Inverted-Index-Search-Engine, Lines: 26, Source: inverted+index+search.py
Example 12: SparkContext
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
__author__ = 'andy'
import json
import pandas as pd
from pyspark import SparkContext
# import sklearn.preprocessing
import matplotlib.pyplot as plt

# create sc
sc = SparkContext('local', 'minute_bar')

# load data
path = '/Users/guoqiangzhang/PycharmProjects/sparktest/data/minute_bar'
rdd_mkt_data = sc.wholeTextFiles(path, minPartitions=80) \
    .setName('index_minute_bar') \
    .cache()

"""
# deal data
"""
# 1. UDF: fetch the minute-bar series we want to predict from rdd_mkt_data
def minute_bar_index(line_id):
    line_data = rdd_mkt_data.filter(lambda x: line_id in x[0]).collect()
    line = pd.DataFrame.from_dict(json.loads(line_data[0][1]))
    line.sort(columns=['barTime'], ascending=True, inplace=True)
    return line

# Specify the id of the series to predict; here we predict the minute
# bars of the SSE Composite Index for 2016-03-17
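The excerpt ends before the id is actually passed in. A hypothetical call, with a placeholder id, to illustrate the intended usage:

# Hypothetical usage: 'sh000001' is a placeholder id for the SSE
# Composite Index; the real id depends on the input file names.
line = minute_bar_index('sh000001')
print(line.head())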
Example 13: return
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
        # (the beginning of this helper is not included in the excerpt)
            weight = 0.01
        else:
            weight = 10000
        return ((student_ID, class_ID), grades * weight)

if __name__ == '__main__':
    sc = SparkContext("local[*]", appName="cal_exam_grades")
    sc.setLogLevel("WARN")
    # classes = sc.wholeTextFiles(class_folder).map(file_mapper_for_class)
    # exams = sc.wholeTextFiles(exam_folder).map(file_mapper_for_exam)
    exam_students_class_pair = sc.wholeTextFiles(class_folder).flatMap(m1)
    exam_assignments_pair = sc.wholeTextFiles(exam_folder).map(m2)
    exam_students_class_assignments_pair = exam_students_class_pair.join(exam_assignments_pair)
    student_assignment_class = exam_students_class_assignments_pair.flatMap(m3)
    cooked_grades = sc.textFile("./intermediate_data/" + str(year) + "_submission_group_and_analyzed_and_ranked/*").map(cooked_submission_line_mapper)
    k_assignment_v_student_class_grades = student_assignment_class.join(cooked_grades).map(set_key_to_aid)
    k_exerciseID_v_assignment = sc.wholeTextFiles(assignment_folder).map(purposed_assignment_file_mapper)
    k_exerciseID_v_exerciseType = sc.wholeTextFiles(exercise_folder).map(purposed_exercise_file_mapper)
Example 14: SparkConf
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
from pyspark.mllib.feature import HashingTF, IDF
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("big_data")
sc = SparkContext(conf=conf)

dirinput = "../bigdata/hw1/wikiset"
diroutput = "../bigdata/hw1/wikitfidf"

rdd = sc.wholeTextFiles(dirinput).map(lambda (name, text): text.split())
tf = HashingTF()
tfVectors = tf.transform(rdd).cache()
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
tfIdfVectors.saveAsTextFile(diroutput)
Example 15: len
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import wholeTextFiles [as alias]
# Check input args and print usage instructions if incorrect.
if len(sys.argv) < 3:  # script name + input dir + output file
    print(
        '\nUsage: > spark-submit file_merge.py <input dir of files> <final output file name>\n\n'
    )
    exit(-1)

# Write a text file
def write_file(text, filename):
    f = open(filename, "w")
    f.write(text)
    f.close()

# Read in command line args
input_dir = sys.argv[1]
output_file = sys.argv[2]

# Set up Spark context
conf = SparkConf().setAppName("File Merger")
sc = SparkContext(conf=conf)

input_files = sc.wholeTextFiles(input_dir)  # Read in files

# Transform files into a single one:
final_content = input_files.map(lambda (doc, content): content.strip()).reduce(
    lambda x, y: x + "\n" + y)

write_file(final_content, output_file)  # save final content into output file
sc.stop()