本文整理汇总了Python中sumy.parsers.plaintext.PlaintextParser.from_file方法的典型用法代码示例。如果您正苦于以下问题:Python PlaintextParser.from_file方法的具体用法?Python PlaintextParser.from_file怎么用?Python PlaintextParser.from_file使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sumy.parsers.plaintext.PlaintextParser的用法示例。
在下文中一共展示了PlaintextParser.from_file方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _firstK_score
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def _firstK_score(storyName, highlightName):
    """Score a first-K-sentences baseline summary against reference highlights.

    Args:
        storyName (str): path to the plain-text story file.
        highlightName (str): path to the plain-text reference highlights file.

    Returns:
        Whatever ``evaluate(geneSen, refSen)`` returns (ROUGE-style score).

    Raises:
        Re-raises any exception from ``evaluate`` after printing the story path.
    """
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    # Baseline summary: simply the first SENTENCES_COUNT sentences of the story.
    geneSen = parser.document.sentences[:SENTENCES_COUNT]
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        # Parenthesized print works in both Python 2 and 3 (single argument).
        print(storyName)
        print(e)
        # Bare raise preserves the original traceback (``raise e`` resets it).
        raise
示例2: summarize
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def summarize(text, n_sentences, sep='\n'):
    '''Produce an automatic extractive summary of `text`.

    Args:
        text (str or file): text itself or an open file-like object
        n_sentences (int): number of sentences to include in summary
    Kwargs:
        sep (str): separator to join summary sentences
    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    Raises:
        TypeError: if `text` is neither a str nor a file-like object.
    '''
    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif hasattr(text, 'read'):
        # Duck-type file-like objects: the original checked against the
        # Python 2-only `file` builtin, which is a NameError on Python 3.
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # Honor the `sep` kwarg; the original hard-coded '\n' and ignored it.
    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
示例3: kl_rank_sum
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def kl_rank_sum(path, K):
    """Summarize the plain-text file at `path` into K sentences.

    NOTE(review): despite the "kl" name, this uses LexRankSummarizer, not a
    KL-divergence summarizer -- confirm which algorithm was intended.

    Args:
        path (str): path to the plain-text input file.
        K (int): number of sentences to include in the summary.

    Returns:
        The sentences selected by the summarizer.
    """
    parser = PlaintextParser.from_file(path, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    # K = number of sentences requested (original had redundant self-assignments).
    return summarizer(parser.document, K)
示例4: lex_rank_sum
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def lex_rank_sum(path, L):
    """Summarize the plain-text file at `path` into L sentences via LexRank.

    Args:
        path (str): path to the plain-text input file.
        L (int): number of sentences to include in the summary.

    Returns:
        list[str]: the selected sentences as strings (handy for writing
        to a summary output file).
    """
    parser = PlaintextParser.from_file(path, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, L)  # L = number of sentences
    # Comprehension replaces the original manual append loop.
    return [str(sentence) for sentence in summary]
示例5: _summ_score
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def _summ_score(storyName, highlightName):
    """Score an automatically generated summary against reference highlights.

    Args:
        storyName (str): path to the plain-text story file.
        highlightName (str): path to the plain-text reference highlights file.

    Returns:
        Whatever ``evaluate(geneSen, refSen)`` returns (ROUGE-style score).

    Raises:
        Re-raises any exception from ``evaluate`` after printing the story path.
    """
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # Generated summary: top SENTENCES_COUNT sentences from the summarizer.
    geneSen = summarizer(parser.document, SENTENCES_COUNT)
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        # Parenthesized print works in both Python 2 and 3 (single argument).
        print(storyName)
        print(e)
        # Bare raise preserves the original traceback (``raise e`` resets it).
        raise
示例6: textrankReferenceSummary
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def textrankReferenceSummary(path):
    """Return the raw text of the top SENTENCES_COUNT TextRank sentences.

    Args:
        path: path to the plain-text file to summarize.

    Returns:
        list: raw sentence strings, in summarizer order.
    """
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # Each sumy Sentence exposes its raw string via the private `_text` attribute.
    return [sentence._text
            for sentence in summarizer(parser.document, SENTENCES_COUNT)]
示例7: _score
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def _score(storyName, highlightName):
    """Interactively inspect per-sentence ROUGE-1 scores.

    For every generated (story) sentence, prints the sentence followed by its
    ROUGE-1 score against each reference sentence, then pauses for a keypress.

    Args:
        storyName (str): path to the plain-text story file.
        highlightName (str): path to the plain-text reference highlights file.
    """
    geneSen = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE)).document.sentences
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences
    # Parenthesized print works in both Python 2 and 3 (single argument).
    print("==============")
    for sen in refSen:
        print(sen)
    for gs in geneSen:
        print(gs)
        # ROUGE-1 of this generated sentence vs. every reference sentence.
        r1 = [rouge_n([gs], [rs], 1) for rs in refSen]
        print(r1)
    # Pause so the scores can be read before returning.
    # NOTE(review): raw_input is Python 2 only (input() on Python 3) --
    # confirm the target interpreter before porting.
    raw_input()
示例8: summarize_file
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def summarize_file(file_name):
    """Summarize a plain-text file into SENTENCES_COUNT sentences.

    Args:
        file_name: path to the plain-text file to summarize.

    Returns:
        list: the selected sentences, stringified.
    """
    # HtmlParser.from_url(...) would work the same way for web pages;
    # here we read a plain-text file.
    parser = PlaintextParser.from_file(file_name, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    ranked = summarizer(parser.document, SENTENCES_COUNT)
    return [str(item) for item in ranked]
示例9: get_smry
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def get_smry(self, input):
    """Summarize the plain-text file at `input` into a dict of sentences.

    Prints each selected sentence as a side effect.

    Args:
        input: path to the plain-text file to summarize.

    Returns:
        dict: stringified index ("0", "1", ...) -> sentence text, for the
        top 10 English sentences.
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    parser = PlaintextParser.from_file(input, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    smry_list = {}
    # enumerate replaces the original hand-maintained counter.
    for idx, sentence in enumerate(summarizer(parser.document, SENTENCES_COUNT)):
        print(sentence)
        smry_list[str(idx)] = str(sentence)
    return smry_list
示例10: extract_summary_keywords
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def extract_summary_keywords(trend, urls, titles):
    """Build a grammar-corrected 12-sentence summary and keywords for a trend.

    Downloads the article text, extracts keywords, round-trips the text
    through a scratch file (sumy's parser wants a file), summarizes it,
    then grammar-checks the joined summary.

    Args:
        trend (str): trend name; used to name the scratch file.
        urls: article URLs passed to extract_text().
        titles: article titles passed to the keyword extractor.

    Returns:
        tuple: (grammar-corrected summary string, extracted keywords).
    """
    total_articles_content = extract_text(urls)
    keywords = extract_keywords_from_all_text(total_articles_content, titles)
    # os.path.join is portable; the original hard-coded a Windows '\\' separator.
    current_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), trend + '.txt')
    with open(current_path, 'w') as the_file:
        the_file.write(total_articles_content)
    try:
        # NOTE(review): `summarizer` below is a module-level global -- confirm
        # it is configured before this function is called.
        parser = PlaintextParser.from_file(current_path, Tokenizer(LANGUAGE))
    finally:
        # Remove the scratch file even if parsing raises (original leaked it).
        os.remove(current_path)
    # Matches the original concatenation exactly, including the leading space.
    sentences = ''.join(' ' + str(sentence)
                        for sentence in summarizer(parser.document, 12))
    # NOTE(review): the original bound this result to an unused variable;
    # the call is kept in case replacesynonym has side effects -- confirm.
    replacesynonym(sentences)
    matches = tool.check(sentences)
    correct_summary = language_check.correct(sentences, matches)
    return correct_summary, keywords
示例11: createSummary
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def createSummary(self, input_file):
    """Compute summary sentences for `input_file` and store them on self.

    Side effects: sets self.__summary to the top-5 sentences (list of str)
    and self.__top to the single best sentence (str).
    """
    parser = PlaintextParser.from_file(
        input_file, Tokenizer(self.__language))
    self.__sumySummarizer.stop_words = get_stop_words(self.__language)
    # NOTE: collected but never read afterwards in this method; kept as-is.
    all_sentences = [str(sentence)
                     for paragraph in parser.document.paragraphs
                     for sentence in paragraph.sentences]
    N = 5
    self.__summary = [str(ranked)
                      for ranked in self.__sumySummarizer(parser.document, N)]
    for best in self.__sumySummarizer(parser.document, 1):
        self.__top = str(best)
示例12: use_sumy
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def use_sumy(input, SENTENCES_COUNT, method, parser_option):
    """Run a sumy summarization over a file path or a raw string.

    Supported summarization methods in sumy include:
    Luhn (heuristic), Edmundson (heuristic with statistic research),
    LSA (latent semantic analysis), LexRank (PageRank/HITS-inspired),
    and TextRank.

    Args:
        input: file path (parser_option='file') or raw text (parser_option='string').
        SENTENCES_COUNT (int): number of sentences to return.
        method: NOTE(review): unused -- the summarizer is fixed to the
            module-level `Summarizer`; confirm whether `method` should select it.
        parser_option (str): 'file' or 'string'.

    Returns:
        list: selected sumy Sentence objects.

    Raises:
        ValueError: if parser_option is not 'file' or 'string'.
    """
    LANGUAGE = "english"
    if parser_option == 'file':
        parser = PlaintextParser.from_file(input, Tokenizer(LANGUAGE))
    elif parser_option == 'string':
        parser = PlaintextParser.from_string(input, Tokenizer(LANGUAGE))
    else:
        # Fail clearly instead of hitting a NameError on `parser` below.
        raise ValueError(
            "parser_option must be 'file' or 'string', got %r" % (parser_option,))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # Comprehension replaces the original manual append loop.
    return [sentence for sentence in summarizer(parser.document, SENTENCES_COUNT)]
示例13: create_summary
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
def create_summary(algorithm, input_file, output_file = "sumy_summary.txt"):
    """Summarize `input_file` with `algorithm` and write results to `output_file`.

    The output file contains the top-5 ranked sentences followed by the
    position (0-4) of the single top-ranked sentence within that list, each
    record terminated by the literal 'XXXXXX' marker.

    Args:
        algorithm: summarizer selector passed to create_summarizer().
        input_file (str): path to the plain-text file to summarize.
        output_file (str): path of the file to write the summary records to.
    """
    # Set language
    LANGUAGE = "english"
    # Get top N ranked sentences
    N = 5
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_file(input_file, Tokenizer(LANGUAGE))
    summarizer = create_summarizer(algorithm, stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # Use the summarizer to get the top ranked sentences.
    top_ranked_sentences = [str(sentence)
                            for sentence in summarizer(parser.document, N)]
    # Find the single top ranked sentence (a 1-sentence summary).
    top_sentence = None
    for sentence in summarizer(parser.document, 1):
        top_sentence = str(sentence)
    # Its position (0..N-1) within the top-N list.
    position = top_ranked_sentences.index(top_sentence)
    # Context manager guarantees the file is closed even if a write fails
    # (the original leaked the handle on error).
    with open(output_file, "w") as record:
        for sentence in top_ranked_sentences:
            record.write(sentence + 'XXXXXX')
        record.write(str(position) + 'XXXXXX')
示例14: summarizer
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
# Batch-summarize every document listed in SIZE_FILE and write each summary
# as an Edmundson baseline. Relies on names defined earlier in the full
# script (sizes, SIZE_FILE, extract, nltk, LANGUAGE...) -- not visible here.
# Python 2 only: sys.setdefaultencoding was removed in Python 3 and normally
# requires reload(sys) first -- TODO confirm the intended interpreter.
sys.setdefaultencoding('utf8')
"""
nltk.data.path.append('/home/kariminf/Data/NLTK/')
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)
"""
# Load "<doc-name>,<summary-size>" records into the `sizes` mapping.
file = open(SIZE_FILE, 'r')  # NOTE: shadows the builtin file (Python 2)
while 1:
    line = file.readline()
    if line == '':  # readline() returns '' only at EOF
        break;
    parts = line.split(",")
    sizes[parts[0]] = int(parts[1])
file.close()
nltk.data.path.append('/home/kariminf/Data/NLTK/')
# Summarize each document to its requested size and write the baseline file.
for eval in sizes:  # NOTE: shadows the builtin eval()
    txt_path = "src/body/text/en/" + eval
    parser = PlaintextParser.from_file(txt_path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = extract(summarizer, sizes[eval])
    # eval[:-9] strips a 9-character suffix from the document name --
    # presumably a fixed extension like ".body.txt"; TODO confirm against
    # the corpus naming scheme.
    fout = open("baselines/EdmundsonSummarizer/en/" + eval[:-9] + ".txt", "w")
    fout.write(summary)
    fout.close()
示例15: len
# 需要导入模块: from sumy.parsers.plaintext import PlaintextParser [as 别名]
# 或者: from sumy.parsers.plaintext.PlaintextParser import from_file [as 别名]
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer #We're choosing Lexrank, other algorithms are also built in
from sumy.summarizers.lsa import LsaSummarizer
from unidecode import unidecode
from wikisum.wikisum import Crawler
import RAKE
import sys
# CLI entry: python summarize.py <filename.txt> <num_sentences>
if len(sys.argv) != 3:
    # StandardError is Python 2 only (removed in Python 3) -- TODO confirm
    # the target interpreter.
    raise StandardError("usage: python summarize.py filename.txt num_sentences")
crawl = Crawler()
file_name = sys.argv[1]  # name of the plain-text file
num_sentences = int(sys.argv[2])
parser = PlaintextParser.from_file(file_name, Tokenizer("english"))
# Count every word in the document: paragraphs > sentences > words.
wordCount = 0
for paragraph in parser.document.paragraphs:
    for sentence in paragraph.sentences:
        for word in sentence.words:
            wordCount += 1
# Collects one summary string per algorithm; filled in below.
results = {"LsaSummary":"", "LexRankSummary":""};
# LSA SUMMARY
summarizer = LsaSummarizer()
summary = summarizer(parser.document, num_sentences)
for sentence in summary: