This article collects typical usage examples of the Python method pyspark.sql.functions.desc. If you have been wondering what functions.desc does or how to use it, the curated code examples below may help. You can also explore further usage examples for the pyspark.sql.functions module that this method belongs to.
The sections below show 6 code examples of functions.desc, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
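Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the two equivalent ways of sorting in descending order that the examples use: the function form F.desc('col') and the Column method form col.desc(). The DataFrame and column names are made up for illustration.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
# toy data, purely for illustration
df = spark.createDataFrame([("a", 3), ("b", 1), ("c", 2)], ["key", "value"])

# function form: pass the column name to F.desc
df.orderBy(F.desc("value")).show()
# Column method form, as several examples below use it
df.orderBy(df.value.desc()).show()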
Example 1: getTopEntities
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import desc [as alias]
def getTopEntities(self, e, targetType='', maxCount=20, minScore=0.0):
    df1 = self.df
    row1 = df1.where(df1.EntityId == e).first()
    self.raiseErrorIfNotFound(row1, e)
    # optionally restrict the candidates to a single entity type
    if targetType == '':
        df2 = df1.where(df1.EntityId != e)
    else:
        df2 = df1.where((df1.EntityId != e) & (df1.EntityType == targetType))
    # score every candidate by cosine similarity against the query entity's vector
    df3 = df2.select(df2.EntityId, df2.EntityType,
                     udfCosineSimilarity(F.lit(row1.Data), df2.Data).alias('Score'))
    return df3.where(df3.Score >= minScore).orderBy(df3.Score.desc()).limit(maxCount)
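A hypothetical call of the method above; the instance name es, the placeholder some_entity_id, and the argument values are made up for illustration and are not part of the original project:

# hypothetical usage: top 10 entities most similar to some_entity_id, keeping scores >= 0.5
top = es.getTopEntities(some_entity_id, targetType='', maxCount=10, minScore=0.5)
top.show()  # EntityId, EntityType, Score — highest similarity first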
# COMMAND ----------
# MAGIC %md **PaperSimilarity** class to compute paper recommendations
# COMMAND ----------
# Parameters:
# resource: resource stream path
# container: container name in Azure Storage (AS) account
# account: Azure Storage (AS) account
# sas: complete 'Blob service SAS URL' of the shared access signature (sas) for the container
# key: access key for the container; if sas is specified, key is ignored
#
# Note:
# resource does not have a header
# you need to provide a value for either sas or key
#
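Based only on the parameter comments above, a hypothetical way to construct the PaperSimilarity class mentioned in the markdown cell might look like the following; the constructor signature is not shown in this excerpt, so the argument names and values here are assumptions for illustration.

# hypothetical instantiation; every argument name/value below is an assumption based on the comments above
ps = PaperSimilarity(resource='graph/PaperEmbeddings.tsv',            # resource stream path (file has no header)
                     container='mag-container',                       # container name in the Azure Storage account
                     account='mystorageaccount',                      # Azure Storage account name
                     sas='<Blob service SAS URL for the container>',  # provide either sas ...
                     key='')                                          # ... or key (ignored when sas is given)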
Example 2: _getTopEntitiesByEmbedding
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import desc [as alias]
def _getTopEntitiesByEmbedding(self, e, maxCount, minScore):
    df1 = self.df
    paperdf = self.mag.getDataframe('Papers')
    row1 = df1.where(df1.EntityId == e).first()
    df2 = df1.where(df1.EntityId != e)
    df3 = df2.select(df2.EntityId, udfCosineSimilarityN(F.lit(row1.Data), df2.Data).alias('Score'))
    return (df3.join(paperdf, df3.EntityId == paperdf.PaperId, 'inner')
               .select(paperdf.PaperId, paperdf.PaperTitle, df3.Score)
               .where((~F.isnan(df3.Score)) & (df3.Score >= minScore))
               .orderBy(df3.Score.desc())
               .limit(maxCount))
Example 3: _getTopEntitiesByCocitation
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import desc [as alias]
def _getTopEntitiesByCocitation(self, e, maxCount, minScore):
    df1 = self.corefdf
    paperdf = self.mag.getDataframe('Papers')
    df2 = df1.where(df1.ReferenceId == e)
    return (df2.join(paperdf, df2.CoReferenceId == paperdf.PaperId, 'inner')
               .select(paperdf.PaperId, paperdf.PaperTitle, df2.Score)
               .where(df2.Score >= minScore)
               .orderBy(df2.Score.desc())
               .limit(maxCount))
Example 4: _compute_cocitations
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import desc [as alias]
def _compute_cocitations(self, coreferenceLimit=50):
    # self-join PaperReferences on the citing paper to find references that are cited together
    pr1 = self.mag.getDataframe('PaperReferences').selectExpr("PaperId as PaperId1", "PaperReferenceId as PaperReferenceId1")
    pr2 = self.mag.getDataframe('PaperReferences').selectExpr("PaperId as PaperId2", "PaperReferenceId as PaperReferenceId2")
    # count co-citations per (ReferenceId, CoReferenceId) pair and keep the most frequent pairs
    return (pr1.join(pr2, pr1.PaperId1 == pr2.PaperId2, 'inner')
               .filter(pr1.PaperReferenceId1 < pr2.PaperReferenceId2)
               .select(pr1.PaperReferenceId1.alias('ReferenceId'), pr2.PaperReferenceId2.alias('CoReferenceId'))
               .groupBy('ReferenceId', 'CoReferenceId').count()
               .orderBy(F.desc('count'))
               .selectExpr("ReferenceId as ReferenceId", "CoReferenceId as CoReferenceId", "count as Score")
               .limit(coreferenceLimit))
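The core pattern of this example — self-join on the citing paper, group, count, and order by F.desc('count') — can be tried in isolation. The sketch below uses toy citation data and placeholder ids, purely for illustration.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
# toy citation pairs: (citing paper, referenced paper)
refs = spark.createDataFrame(
    [(1, 10), (1, 11), (2, 10), (2, 11), (2, 12)],
    ["PaperId", "PaperReferenceId"])

a = refs.selectExpr("PaperId as PaperId1", "PaperReferenceId as ReferenceId")
b = refs.selectExpr("PaperId as PaperId2", "PaperReferenceId as CoReferenceId")
# references 10 and 11 are cited together by papers 1 and 2, so that pair comes out on top
(a.join(b, a.PaperId1 == b.PaperId2, 'inner')
  .filter(a.ReferenceId < b.CoReferenceId)
  .groupBy('ReferenceId', 'CoReferenceId')
  .count()
  .orderBy(F.desc('count'))
  .show())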
# COMMAND ----------
Example 5: process_file
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import desc [as alias]
# imports used by this snippet
import os
import subprocess
from glob import glob

import pubmed_parser as pp
from pyspark.sql import Row, Window
from pyspark.sql.functions import desc, max, rank  # max here shadows the builtin, as in the original code

# sc (SparkContext), save_dir and download_dir are assumed to be defined elsewhere in the script


def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove previously written parquet folders if they still exist
    old_parquet = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if old_parquet:
        subprocess.call(['rm', '-rf'] + old_parquet)
    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    # parse every MEDLINE XML file into one Row per publication, tagged with its source file name
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')
    # keep only the latest record per pmid: order each pmid's rows by file_name descending, take rank 1
    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')
    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
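Only the window step above uses desc: records are partitioned by pmid, ordered by file_name descending, and the rank-1 row (the newest file) is kept. Below is a minimal standalone sketch of the same deduplication idea, with made-up data.

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import desc, rank

spark = SparkSession.builder.getOrCreate()
# toy records: pmid "p1" appears in two files; keep only the row from the newest file
df = spark.createDataFrame(
    [("p1", "medline_0001.xml.gz"), ("p1", "medline_0002.xml.gz"), ("p2", "medline_0001.xml.gz")],
    ["pmid", "file_name"])

w = Window.partitionBy("pmid").orderBy(desc("file_name"))
df.select("*", rank().over(w).alias("pos")).where("pos = 1").show()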
Example 6: _get_fliers
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import desc [as alias]
def _get_fliers(colname, outliers):
    # Keep only the rows flagged as outliers (used when "showfliers" is True)
    fliers_df = outliers.filter("__{}_outlier".format(colname))

    # When showing fliers, take the top 1k values with the highest absolute value
    fliers = (
        fliers_df.select(F.abs(F.col("`{}`".format(colname))).alias(colname))
        .orderBy(F.desc("`{}`".format(colname)))
        .limit(1001)
        .toPandas()[colname]
        .values
    )
    return fliers
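For context, _get_fliers expects an outliers DataFrame that already contains a boolean column named __<colname>_outlier. The preparation step below is an assumption (a simple 1.5×IQR fence on a made-up price column), not part of this excerpt; it only illustrates how the function could be fed.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
# toy numeric column with one obvious outlier
df = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (100.0,)], ["price"])

# hypothetical outlier flagging via 1.5*IQR fences; _get_fliers only needs the
# boolean "__<col>_outlier" column plus the numeric column itself
q1, q3 = df.approxQuantile("price", [0.25, 0.75], 0.0)
iqr = q3 - q1
outliers = df.withColumn(
    "__price_outlier",
    (F.col("price") < q1 - 1.5 * iqr) | (F.col("price") > q3 + 1.5 * iqr))

fliers = _get_fliers("price", outliers)  # numpy array of flier values, largest absolute values first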