本文整理汇总了Python中pyspark.SparkContext.mongoRDD方法的典型用法代码示例。如果您正苦于以下问题:Python SparkContext.mongoRDD方法的具体用法?Python SparkContext.mongoRDD怎么用?Python SparkContext.mongoRDD使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.SparkContext
的用法示例。
在下文中一共展示了SparkContext.mongoRDD方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from pyspark import SparkContext [as 别名]
# 或者: from pyspark.SparkContext import mongoRDD [as 别名]
def main():
    """Copy every document from one MongoDB collection into another via Spark.

    Relies on pymongo_spark.activate() having patched mongoRDD onto
    SparkContext and saveToMongoDB onto RDD.
    """
    print('Start!')
    spark_conf = SparkConf().setAppName("pyspark_test")
    spark_context = SparkContext(conf=spark_conf)
    # Read the source collection as an RDD of plain documents ...
    documents = spark_context.mongoRDD('mongodb://localhost:27017/test_database.transactions')
    # ... and write them straight back into the copy collection.
    documents.saveToMongoDB('mongodb://localhost:27017/test_database.transactions_copy')
    print('Completed!')
示例2: main
# 需要导入模块: from pyspark import SparkContext [as 别名]
# 或者: from pyspark.SparkContext import mongoRDD [as 别名]
def main():
    """Open the db.contextizer collection as a Spark RDD of documents."""
    spark_conf = SparkConf().setAppName("pyspark test")
    spark_context = SparkContext(conf=spark_conf)
    # mongoRDD (patched in by pymongo_spark.activate()) yields whole
    # documents, not key/value pairs; mongoPairRDD would give pairs.
    documents = spark_context.mongoRDD('mongodb://localhost:27017/db.contextizer')
示例3: main
# 需要导入模块: from pyspark import SparkContext [as 别名]
# 或者: from pyspark.SparkContext import mongoRDD [as 别名]
def main():
    """Exercise MongoDB access two ways: plain pymongo and pymongo_spark.

    Reads the connection URL from configuration.cfg, dumps one collection
    with pymongo, copies another through Spark, then loads a sample JSON
    file and stores it in MongoDB as well.
    """
    spark_conf = SparkConf().setAppName("pyspark test")
    spark_context = SparkContext(conf=spark_conf)
    sql_context = SQLContext(spark_context)

    # MongoDB connection URL comes from the batch-properties config file.
    parser = ConfigParser.ConfigParser()
    parser.read('configuration.cfg')
    mongodb_connection = parser.get('BatchProperties', 'URLMongoDB')

    # --- Plain pymongo: print every document of test.tabla1. ---
    client = MongoClient()
    db = client.test
    for document in db.tabla1.find():
        print(document)

    # --- pymongo_spark: copy test.tabla1 into test.tabla2 via Spark. ---
    table_rdd = spark_context.mongoRDD(mongodb_connection + 'test.tabla1')
    table_rdd.saveToMongoDB(mongodb_connection + 'test.tabla2')

    # Resolve the project root, read a sample JSON file from it, and
    # store the resulting DataFrame in MongoDB (db: test, coll: tabla3).
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    sample_path = os.path.join(BASE_DIR + '/datasets/batch/air', 'ficheroSalidaAire.txt')
    sample_df = sql_context.jsonFile(sample_path)
    sample_df.saveToMongoDB(mongodb_connection + 'test.tabla3')
示例4: MongoClient
# 需要导入模块: from pyspark import SparkContext [as 别名]
# 或者: from pyspark.SparkContext import mongoRDD [as 别名]
# Example 4 (fragment): per-minute keyword analysis over one day of tweets
# stored in MongoDB. Written for Python 2 (tuple-parameter lambdas below).
start_time = time.time()
client = MongoClient('localhost',27017)
utc=pytz.UTC
#datetime.datetime.now().replace(tzinfo=utc)
db = client['disaster']
# Handle to the per-minute results collection; written to later, outside
# this excerpt.
minuteAnalysisLatest = db['minute']
# Monkey-patches mongoRDD onto SparkContext and saveToMongoDB onto RDD.
pymongo_spark.activate()
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.analysisData')
#Objective 1: Get the number of times the key words(222) are used for 1 particular day , For every minute.
# Analysis window: 2016-03-24 00:00 UTC, one day long.
dayOne=datetime.datetime(2016, 3, 24, 0, 0 , 0).replace(tzinfo=utc)
incrementByAMinute = datetime.timedelta(minutes=1)
incrementByADay = datetime.timedelta(days=1)
dayOneEnd = dayOne + incrementByADay
# NOTE(review): datetime.replace() returns a new object; this result is
# discarded, so the call is a no-op (dayOneEnd already inherited utc from
# dayOne). Probably meant `dayOneEnd = dayOneEnd.replace(tzinfo=utc)`.
dayOneEnd.replace(tzinfo=utc)
# Keep only (text, created_at) pairs strictly inside day one. The
# `lambda (x,y): ...` form is Python-2-only tuple-parameter syntax.
contentRdd = rdd.map(lambda x: (x['text'],x['created_at'])).filter(lambda (x,y): y > dayOne and y < dayOneEnd).persist()
#count = {'bomb': 0, 'violent storm': 0, 'hijacker': 0, 'bombed': 0, 'sunk': 0, 'avalanche': 0, 'debris': 0, 'body bag': 0, 'battle': 0, 'fear': 0, 'weapons': 0, 'catastrophe': 0, 'forest fire': 0, 'ruin': 0, 'buildings burning': 0, 'blaze': 0, 'fatal': 0, 'airplane accident': 0, 'sinking': 0, 'electrocute': 0, 'rescue': 0, 'hostage': 0, 'massacre': 0, 'traumatised': 0, 'trouble': 0, 'screaming': 0, 'suicide bomb': 0, 'annihilated': 0, 'loud bang': 0, 'floods': 0, 'quarantine': 0, 'obliterate': 0, 'cliff fall': 0, 'body bagging': 0, 'snowstorm': 0, 'whirlwind': 0, 'disaster': 0, 'bleeding': 0, 'razed': 0, 'famine': 0, 'armageddon': 0, 'wreck': 0, 'thunder': 0, 'wrecked': 0, 'crush': 0, 'burned': 0, 'sirens': 0, 'explosion': 0, 'screams': 0, 'rescuers': 0, 'bridge collapse': 0, 'survivors': 0, 'fatality': 0, 'earthquake': 0, 'accident': 0, 'flames': 0, 'detonate': 0, 'mass murderer': 0, 'smoke': 0, 'military': 0, 'stretcher': 0, 'blizzard': 0, 'danger': 0, 'bloody': 0, 'panicking': 0, 'drowned': 0, 'eyewitness': 0, 'devastation': 0, 'bush fires': 0, 'army': 0, 'heat wave': 0, 'emergency plan': 0, 'tragedy': 0, 'collided': 0, 'survive': 0, 'injury': 0, 'riot': 0, 'attacked': 0, 'fire': 0, 'bioterrorism': 0, 'wounds': 0, 'quarantined': 0, 'drown': 0, 'hailstorm': 0, 'casualties': 0, 'mass murder': 0, 'demolish': 0, 'collision': 0, 'pandemonium': 0, 'sandstorm': 0, 'electrocuted': 0, 'landslide': 0, 'flooding': 0, 'mayhem': 0, 'rainstorm': 0, 'demolition': 0, 'blew up': 0, 'hijacking': 0, 'siren': 0, 'terrorist': 0, 'inundated': 0, 'damage': 0, 'lava': 0, 'devastated': 0, 'forest fires': 0, 'outbreak': 0, 'terrorism': 0, 'panic': 0, 'detonation': 0, 'injured': 0, 'deluged': 0, 'windstorm': 0, 'thunderstorm': 0, 'hazard': 0, 'crushed': 0, 'crashed': 0, 'blood': 0, 'buildings on fire': 0, 'destruction': 0, 'deluge': 0, 'weapon': 0, 'sinkhole': 0, 'aftershock': 0, 'ambulance': 0, 'wreckage': 0, 'desolate': 0, 'blown up': 0, 'fatalities': 0, 'injuries': 0, 'bombing': 0, 
'structural failure': 0, 'death': 0, 'police': 0, 'destroyed': 0, 'engulfed': 0, 'crash': 0, 'emergency': 0, 'inundation': 0, 'collide': 0, 'blight': 0, 'destroy': 0, 'dust storm': 0, 'mudslide': 0, 'displaced': 0, 'arsonist': 0, 'nuclear reactor': 0, 'blazing': 0, 'lightning': 0, 'explode': 0, 'tsunami': 0, 'burning buildings': 0, 'volcano': 0, 'hijack': 0, 'refugees': 0, 'derailment': 0, 'harm': 0, 'hail': 0, 'bioterror': 0, 'hurricane': 0, 'trauma': 0, 'evacuation': 0, 'cyclone': 0, 'epicentre': 0, 'nuclear disaster': 0, 'hostages': 0, 'obliteration': 0, 'suicide bomber': 0, 'drowning': 0, 'derailed': 0, 'threat': 0, 'apocalypse': 0, 'chemical emergency': 0, 'burning': 0, 'obliterated': 0, 'screamed': 0, 'fire truck': 0, 'seismic': 0, 'wildfire': 0, 'emergency services': 0, 'attack': 0, 'storm': 0, 'catastrophic': 0, 'twister': 0, 'evacuated': 0, 'natural disaster': 0, 'collapse': 0, 'trapped': 0, 'war zone': 0, 'exploded': 0, 'collapsed': 0, 'oil spill': 0, 'evacuate': 0, 'typhoon': 0, 'dead': 0, 'survived': 0, 'first responders': 0, 'keyword': 0, 'radiation emergency': 0, 'annihilation': 0, 'deaths': 0, 'rubble': 0, 'ablaze': 0, 'meltdown': 0, 'casualty': 0, 'body bags': 0, 'upheaval': 0, 'flood': 0, 'demolished': 0, 'rioting': 0, 'hellfire': 0, 'curfew': 0, 'hazardous': 0, 'tornado': 0, 'desolation': 0, 'flattened': 0, 'drought': 0, 'derail': 0, 'arson': 0, 'rescued': 0, 'suicide bombing': 0, 'wild fires': 0, 'wounded': 0}
# for issue in count.keys():
# print issue
def getCount(content):
示例5: MongoClient
# 需要导入模块: from pyspark import SparkContext [as 别名]
# 或者: from pyspark.SparkContext import mongoRDD [as 别名]
# Example 5 (fragment): sweep a day range in fixed windows, collect keywords
# whose stored 10-minute averages exceed a threshold often enough, and
# record an alert document per window. Python-2 syntax (tuple lambdas).
import pytz
import time
from operator import add
from pymongo import MongoClient
start_time = time.time()
client = MongoClient('localhost',27017)
utc=pytz.UTC
db = client['disaster']
# NOTE(review): named "threeHourlyAlert" but bound to the 'minute'
# collection; the insert below uses db.threeHourlyAlert directly instead,
# so this handle appears unused here.
threeHourlyAlert = db['minute']
# Monkey-patches mongoRDD onto SparkContext and saveToMongoDB onto RDD.
pymongo_spark.activate()
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.overAll10MinuteAverage').persist()
dayOne=datetime.datetime(2016, 3, 24, 0, 0 , 0).replace(tzinfo=utc)
# NOTE(review): despite the "3Hour" names, the step is 2 hours.
incrementBy3Hour= datetime.timedelta(hours=2)
# NOTE(review): loop-body indentation was lost in extraction; the
# statements below (through the final dayOne advance) presumably all
# belong inside this loop. `lambda (x,y): ...` is Python-2-only syntax.
for x in range(288):
dayOneIncrementBy3Hour = dayOne + incrementBy3Hour
dayOneIncrementBy3Hour = dayOneIncrementBy3Hour.replace(tzinfo=utc)
# For the current window: take each document's per-keyword averages,
# keep keywords averaging > 8, count occurrences per keyword, keep those
# seen more than 8 times, and collect just the keyword names.
output = rdd.filter( lambda x: x['date'] >= dayOne and x['date'] < dayOneIncrementBy3Hour ).flatMap(lambda x: x['average'].items()).filter(lambda (x,y): y > 8 ).map(lambda (x,y): (x,1)).reduceByKey(lambda x,y:x+y).filter(lambda (x,y): y>8).map(lambda(x,y): x).collect()
if output != []:
result = db.threeHourlyAlert.insert_one({"date": dayOne , "count":output})
# Advance the window start to the end of the window just processed.
dayOne = dayOneIncrementBy3Hour.replace(tzinfo=utc)
示例6: main
# 需要导入模块: from pyspark import SparkContext [as 别名]
# 或者: from pyspark.SparkContext import mongoRDD [as 别名]
def main():
    """Print the first document of the estreaming.splash collection."""
    spark_conf = SparkConf().setAppName("pyspark read")
    spark_context = SparkContext(conf=spark_conf)
    # mongoRDD is patched onto SparkContext by pymongo_spark.activate().
    collection_rdd = spark_context.mongoRDD('mongodb://localhost:27017/estreaming.splash')
    print(collection_rdd.first())
示例7: SparkContext
# 需要导入模块: from pyspark import SparkContext [as 别名]
# 或者: from pyspark.SparkContext import mongoRDD [as 别名]
# Fragment: tail of an unseen earlier loop (Python-2 print statement);
# prints each document's _id.
print doc["_id"]
if __name__ == '__main__':
# Monkey-patches mongoRDD onto SparkContext and saveToMongoDB onto RDD.
pymongo_spark.activate()
# NOTE(review): datetime.time() builds a midnight time-of-day object,
# not a timestamp; `time.time()` was probably intended here.
start_time=datetime.time()
conf = (SparkConf()
.setAppName("LinkingPipeLine"))
# Ship the pipeline's helper modules to the Spark executors.
sc = SparkContext(conf=conf,pyFiles=['/home/naveen/spark-1.6.0-bin-hadoop2.6/linkPipe/LinkPipeMethods.py',
'/home/naveen/spark-1.6.0-bin-hadoop2.6/linkPipe/tagText.py'])
#setting up RDD
rdd = sc.mongoRDD('mongodb://10.1.1.5:27017/GaugeDB.test_judgments')
#Filtering Criteria for RDD
# Keep only judgments whose pipefinal flag equals 1.
filterRDD = rdd.filter(lambda x : True if x["pipefinal"] == 1 else False)
#Config paths
# Model/artifact locations used by later pipeline steps (the script
# continues past this excerpt).
path1 = "/usr/linkPipModels/CRF-Model-OnlyCodes"
path2 = "/usr/linkPipModels/CRF-Model-OnlyTitles"
path3 = "/usr/linkPipModels/VectorSpaceTitles_word.p"
path4 = "/usr/linkPipModels/VectorSpaceCodes.p"
path5 = "/usr/linkPipModels/Tf-IdfOnlytitles.p"
path6 = "/usr/linkPipModels/Tf-IdfCitationCodes.p"
path7 = "/usr/linkPipModels/TitleClassifier.p"
path8 ="/usr/linkPipModels/JournalDictForStep1.p"
path9 ="/usr/linkPipModels/JournalDictForStep2.p"