This page collects typical usage examples of the Python method library.file_io.FileIO.writeToFileAsJson. If you are wondering what FileIO.writeToFileAsJson does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also browse the containing class library.file_io.FileIO for further usage examples.
The 15 code examples of FileIO.writeToFileAsJson shown below are sorted by popularity by default. You can upvote the ones you find useful; your feedback helps the site recommend better Python examples.
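Before the full examples, here is a minimal sketch of the pattern most of them follow: write one JSON record per call to an output file, then stream the records back with FileIO.iterateJsonFromFile. The output path is hypothetical, and the append-one-record-per-call behaviour is an assumption inferred from the examples below rather than a documented contract.

# Minimal usage sketch. Assumptions: FileIO is importable as shown in the examples,
# writeToFileAsJson appends one JSON-serialized record per call, and
# iterateJsonFromFile yields those records back one at a time.
from library.file_io import FileIO

log_file = 'experiment_log.json'  # hypothetical output path

# Write a few records, one JSON object per call.
for time_step, count in enumerate([10, 25, 40]):
    FileIO.writeToFileAsJson({'t': time_step, 'count': count}, log_file)

# Read the records back for analysis.
for record in FileIO.iterateJsonFromFile(log_file):
    print(record['t'], record['count'])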
Example 1: trendCurves
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def trendCurves(iterationData=None, experimentFileName=None):
    if iterationData:
        currentTimeStep, _, currentTopics, _, finalCall, conf = iterationData
        experimentFileName = conf['experimentFileName']
        if not finalCall:
            topicDistribution = dict((str(topic.id), {'total': topic.totalCount, 'timeStep': topic.countDistribution[currentTimeStep]}) for topic in currentTopics)
            # print currentTimeStep
            FileIO.writeToFileAsJson({'t':currentTimeStep, 'topics':topicDistribution}, experimentFileName)
        else:
            iterationInfo = {'trending_topics': [topic.id for topic in currentTopics if topic.stickiness>=stickinessLowerThreshold],
                             'topic_colors': dict((str(topic.id), topic.color) for topic in currentTopics),
                             'conf': conf}
            del iterationInfo['conf']['spamDectectionMethod']
            FileIO.writeToFileAsJson(iterationInfo, experimentFileName)
    else:
        topicsDataX = defaultdict(list)
        topicsDataY = defaultdict(list)
        for data in FileIO.iterateJsonFromFile(experimentFileName):
            if 'conf' not in data:
                for topic in data['topics']: topicsDataX[topic].append(data['t']), topicsDataY[topic].append(data['topics'][topic]['timeStep'])
            else: topicColorMap=data['topic_colors']; trendingTopics=data['trending_topics']
        for topic in topicsDataX: plt.fill_between(topicsDataX[topic], topicsDataY[topic], color=topicColorMap[str(topic)], alpha=1.0)
        plt.figure()
        for topic in trendingTopics: plt.fill_between(topicsDataX[str(topic)], topicsDataY[str(topic)], color=topicColorMap[str(topic)], alpha=1.0)
        plt.ylabel('Number of Contents', fontsize=16, fontweight='bold')
        plt.show()
Example 2: modifiedClusterAnalysisMethod
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
    global evaluation, previousTime
    currentTime = time.time()
    documentClusters = [
        cluster.documentsInCluster.keys()
        for k, cluster in hdStreamClusteringObject.clusters.iteritems()
        if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings["cluster_filter_threshold"]
    ]
    iteration_data = evaluation.getEvaluationMetrics(
        documentClusters,
        currentTime - previousTime,
        {
            "type": experts_twitter_stream_settings["dimensions_performance_type"],
            "dimensions": experts_twitter_stream_settings["dimensions"],
        },
    )
    iteration_data["no_of_observed_dimensions"] = len(hdStreamClusteringObject.phraseTextToPhraseObjectMap)
    previousTime = time.time()
    FileIO.writeToFileAsJson(iteration_data, JustifyDimensionsEstimation.stats_file)
    del iteration_data["clusters"]
    print currentMessageTime, iteration_data
    if experts_twitter_stream_settings["dimensions"] != 76819 and 2 * experts_twitter_stream_settings[
        "dimensions"
    ] <= len(hdStreamClusteringObject.phraseTextToPhraseObjectMap):
        raise Exception
Example 3: generate_hashtag_specific_location_and_pure_influence_scores
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids):
    for test_model_id in test_models_ids:
        output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score%(test_model_id)
        GeneralMethods.runCommand('rm -rf %s'%output_file)
        ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
        for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
                enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
            ltuo_location_and_occurrence_times = [(location, sorted(zip(*ito_location_and_occurrence_time)[1]))
                                                  for location, ito_location_and_occurrence_time in
                                                  groupby(
                                                      sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)),
                                                      key=itemgetter(0)
                                                  )]
            print hashtag_count, test_model_id
            ltuo_location_and_pure_influence_score = []
            for location, location_occurrence_times in ltuo_location_and_occurrence_times:
                pure_influence_scores = []
                for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times:
                    if location!=neighbor_location:
                        pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times)
                        pure_influence_scores.append(pure_influence_score)
                ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)])
            ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)
Example 4: generateRadiusSpots
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def generateRadiusSpots(radiusInMiles):
    graph = nx.Graph()
    spotsFile = radiusSpotsFolder+'%s'%(radiusInMiles)
    print 'Creating:', spotsFile
    for lid in locationIterator():
        for location in nearbyLocations(lid, radiusInMiles): graph.add_edge(location['_id'], lid)
    for locations in nx.connected_components(graph): FileIO.writeToFileAsJson({'venues': locations}, spotsFile)
Example 5: generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
    for model_id in models_ids:
        # if w_extra_hashtags: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
        # else: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, wout_extra_hashtags_tag)
        output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
        GeneralMethods.runCommand('rm -rf %s'%output_file)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
        )):
            print line_count, model_id
            tuo_neighbor_location_and_pure_influence_score = []
            location_hashtag_set = set(location_object['hashtags'])
            for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                pure_influence_scores = []
                for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                    if hashtag in location_object['hashtags']:
                        location_occurrences = location_object['hashtags'][hashtag][0]
                        pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences))
                neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                if hashtag_tag==w_extra_hashtags_tag:
                    for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set): pure_influence_scores.append(1.0)
                    for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set): pure_influence_scores.append(-1.0)
                mean_pure_influence_score = np.mean(pure_influence_scores)
                tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score])
            tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)
Example 6: mr_data_analysis
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def mr_data_analysis(input_files_start_time, input_files_end_time, min_hashtag_occurrences):
    # output_file = f_tuo_normalized_occurrence_count_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tweet_count_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_lid_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    output_file = f_tuo_hashtag_and_occurrence_count_and_entropy_and_focus_and_coverage_and_peak%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_rank_and_average_percentage_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_iid_and_interval_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_iid_and_perct_change_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_hashtag_objects%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_lid_and_ltuo_other_lid_and_temporal_distance%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_lid_and_ltuo_other_lid_and_no_of_co_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_high_accuracy_lid_and_distribution%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_no_of_hashtags_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_no_of_locations_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_no_of_peak_lids_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    print PARAMS_DICT
    # runMRJob(MRAnalysis, output_file, getInputFiles(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks':300})
    runMRJob(MRAnalysis, output_file, getPreprocessedHashtagsFile(), jobconf={'mapred.reduce.tasks':300})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
Example 7: generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder):
    def location_similarity(location_vector_1, location_vector_2):
        return reduce(lambda total, k: total+(location_vector_1.get(k,0)*location_vector_2.get(k,0)), set(location_vector_1.keys()).union(location_vector_2.keys()),0.)
    influence_types=[InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]
    for model_id in model_ids:
        mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id))
        GeneralMethods.runCommand('rm -rf %s'%tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
        )):
            print line_count
            location = location_object['id']
            tuo_neighbor_location_and_mf_influence_type_and_similarity = []
            for neighbor_location in location_object['links'].keys():
                mf_influence_type_and_similarity = {}
                for influence_type in influence_types:
                    similarity = location_similarity(
                        mf_location_to_mf_influence_type_to_influence_vector[location][influence_type],
                        mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type]
                    )
                    mf_influence_type_and_similarity[influence_type] = similarity
                so_hashtags_for_location = set(location_object['hashtags'].keys())
                so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys())
                numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0.
                denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0.
                mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator/denominator
                tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity])
            FileIO.writeToFileAsJson(
                [location, tuo_neighbor_location_and_mf_influence_type_and_similarity],
                tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id
            )
Example 8: measureRankingQuality
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def measureRankingQuality(iterationData=None, experimentFileName=None):
    # def getTopTopics(model, noOfTopics):
    #     topics = set()
    #     topTopics = model.topTopics[:]
    #     while True:
    #         topicIndex = GeneralMethods.weightedChoice([i[1] for i in topTopics])
    #         topic = topTopics[topicIndex][0].id
    #         del topTopics[topicIndex]
    #         if topic not in topics: topics.add(topic)
    #         if len(topics)==noOfTopics or len(topics)==len(model.topTopics): break
    #     return [(t, 0) for t in topics]
    if iterationData:
        currentTimeStep, model, _, _, finalCall, conf = iterationData
        if not finalCall:
            rankingMethods = conf["rankingMethods"]
            experimentFileName = conf["experimentFileName"]
            topTopics = sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)[:10]
            # topTopics = getTopTopics(model, 10)
            # topTopics = random.sample(sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)[:10], min(len(model.topicsDistributionInTheTimeSet),5))
            # topTopics = random.sample(model.topicsDistributionInTheTimeSet.items(), min(len(model.topicsDistributionInTheTimeSet),5))
            iterationData = {"currentTimeStep": currentTimeStep, "spammmess": defaultdict(list)}
            for rankingMethod in rankingMethods:
                for queryTopic, _ in topTopics:
                    ranking_id, messages = rankingMethod(queryTopic, model.topicToMessagesMap, **conf)
                    # if spammness(messages, norm_k)==0:
                    #     print 'c'
                    # print rankingMethod, spammness(messages, norm_k)
                    iterationData["spammmess"][ranking_id].append(spammness(messages, norm_k))
                    # print ranking_id, spammness(messages, norm_k)
            FileIO.writeToFileAsJson(iterationData, experimentFileName)
            model.topicsDistributionInTheTimeSet = defaultdict(int)
Example 9: dimensionsEstimation
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def dimensionsEstimation(estimationObject, currentMessageTime):
    '''
    This method is used to estimate the number of dimensions in the stream. To estimate it we calculate
    the number of phrases that need to be added every iteration for different dimensions.
    The dimension at which the number of phrases added stabilizes is the number of dimensions
    for the stream.
    Why do we need this?
    The aim is to get dimensions that don't change too often and, at the same time, are not very large.
    This experiment gives us an approximate idea of the number of dimensions. Randomly picking
    a small value will result in dimensions that are not good, and picking too big a value will
    result in inefficiency.
    '''
    def updatePhraseScore(phraseObject):
        phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
        return phraseObject
    topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)]
    oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
    if estimationObject.topDimensionsDuringPreviousIteration:
        dimensions_estimation = {}
        for boundary in estimationObject.boundaries:
            if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
        print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
        iterationData = {
            'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
            'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
            'settings': estimationObject.stream_settings.convertToSerializableObject(),
            ParameterEstimation.dimensionsEstimationId: dimensions_estimation
        }
        FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
    estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
Example 10: generateStatsForMRKMeansClusteringQuality
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def generateStatsForMRKMeansClusteringQuality():
    for i in [90000, 100000, 200000, 300000, 400000, 500000]:
        print 'Generating stats for: ', i
        tf = TweetsFile(i, **experts_twitter_stream_settings)
        FileIO.writeToFileAsJson({'mr_k_means': tf.generateStatsForKMeansMRClustering(),
                                  'settings': Settings.getSerialzedObject(tf.stream_settings)},
                                 TweetsFile.mr_stats_file)
Example 11: generate
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def generate(self):
    i=0
    for tweet in TwitterIterators.iterateTweetsFromExperts():
        FileIO.writeToFileAsJson(tweet, self.fileName)
        i+=1
        if i==self.length: break
    os.system('gzip %s'%self.fileName)
Example 12: dimensionsUpdateFrequencyEstimation
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
    '''
    Observe the new dimensions that get added to the current dimensions if the dimensions
    are being updated at regular intervals.
    For example, the number of dimensions added after 10m, 20m, ... 5 hours.
    As time increases the number of 'decayed' dimensions increases. The current dimensions
    have a lot of unwanted decayed dimensions. Using this information, identify the time
    interval that is best suited to refresh dimensions.
    Tentative: We decide to pick the time interval at which the rate of decay is maximum.
    '''
    def updatePhraseScore(phraseObject):
        phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
        return phraseObject
    dimensions = estimationObject.stream_settings['dimensions']
    newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
    print currentMessageTime, len(newList)
    if len(newList) >= dimensions:
        idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
        dimensionsUpdateFrequency = {}
        for td, id in idsOfDimensionsListToCompare:
            oldList = estimationObject.dimensionListsMap[id]
            dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
        print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
        iterationData = {
            'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
            'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
            'settings': pprint.pformat(estimationObject.stream_settings),
            ParameterEstimation.dimensionsUpdateFrequencyId: dimensionsUpdateFrequency
        }
        FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
        estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
        for key in estimationObject.dimensionListsMap.keys()[:]:
            if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
Example 13: generate_data_for_significant_nei_utm_ids
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def generate_data_for_significant_nei_utm_ids():
    output_file = GeneralMethods.get_method_id()+'.json'
    so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {}
    for utm_object in \
            FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
        for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
            if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
        mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                utm_object['mf_nei_utm_id_to_common_h_count'].keys()
    hashtags = sorted(list(so_hashtags))
    mf_utm_id_to_vector = {}
    for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
        # print i, utm_object['utm_id']
        utm_id_vector = map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                            hashtags)
        mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector)
    for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()):
        print '%s of %s'%(i+1, len(mf_utm_id_to_vector))
        ltuo_utm_id_and_vector = [(utm_id, vector)]
        for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]:
            if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id:
                ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id]))
        od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
        df_utm_vectors = robjects.DataFrame(od)
        df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors)
        dfm_dict = cjson.decode(df_utm_vectors_json)
        mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames))
        utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id]
        dfm_dict['prediction_variable'] = utm_id_colname
        dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname,
                                                 df_utm_vectors.colnames)
        dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0]))
        FileIO.writeToFileAsJson(dfm_dict, output_file)
Example 14: analyzeQuality
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def analyzeQuality(graphs, graphType):
    def getQualityScore(graphMap, edgesToKeep, timeDifference):
        dataToReturn = []
        for j, intervalInSeconds in enumerate([1]):
            intervalInSeconds*=timeDifference
            linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep)
            logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep)
            linearClusters = [[str(c), [l[0] for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
            logarithmicClusters = [[str(c), [l[0] for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
            score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters)
            print intervalInSeconds, edgesToKeep, score
            dataToReturn.append(score)
        return dataToReturn
    graphFile = qualityMetricsFolder%graphType
    print graphFile
    GeneralMethods.runCommand('rm -rf %s'%graphFile)
    for edgesToKeep in range(1, 11):
    # for edgesToKeep in [1, 10]:
        edgesToKeep*=0.1
        graphMap = dict(graphs[:])
        startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
        timeDifference = endingGraphId-startingGraphId
        LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
        # print {'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}
        FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}, graphFile)
Example 15: generateStatsForDefaultStreamSettings
# Required import: from library.file_io import FileIO [as alias]
# Or: from library.file_io.FileIO import writeToFileAsJson [as alias]
def generateStatsForDefaultStreamSettings():
    for i in [10**3, 10**4, 10**5]:
        for j in range(1, 10):
            print 'Generating stats for: ', i*j
            tf = TweetsFile(i*j, **default_experts_twitter_stream_settings)
            FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForStreamingLSHClustering(),
                                      'settings': Settings.getSerialzedObject(tf.stream_settings)},
                                     TweetsFile.default_stats_file)