This article collects typical usage examples of the Python class sumy.parsers.plaintext.PlaintextParser. If you have been wondering what PlaintextParser does, how to use it, or what real-world code using it looks like, the curated class examples below should help.
Below are 15 code examples of the PlaintextParser class, sorted by popularity by default.
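Before diving into the examples, here is a minimal, self-contained sketch of how PlaintextParser is typically combined with sumy's tokenizer, stemmer, and a summarizer; the sample text, language, and sentence count are placeholders chosen for illustration.

# Minimal sketch of typical PlaintextParser usage; the sample text and the
# sentence count (2) are placeholders, not taken from any example below.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
text = ("Sumy is a library for automatic summarization of text documents. "
        "It ships with several extraction-based summarization algorithms.")

# Build a document model from a plain string ...
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
# ... or from a file path: PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

summarizer = LsaSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, 2):  # 2 = number of summary sentences
    print(sentence)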
Example 1: summarize
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): the text itself, or an open file containing the text
        n_sentences (int): number of sentences to include in the summary
    Kwargs:
        sep (str): separator used to join the summary sentences
    Returns:
        (str) n_sentences-long, automatically produced summary of the text
    '''
    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif isinstance(text, file):
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
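A possible call to this helper, assuming the same sumy imports and LANGUAGE constant used inside the function, might look like the following; the article text is a placeholder.

# Hypothetical usage of the summarize() helper above; the text is a placeholder.
article = ("The first sentence introduces the topic. The second adds detail. "
           "The third gives an example. The fourth concludes.")
print(summarize(article, n_sentences=2, sep=' '))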
Example 2: test_split_into_words
def test_split_into_words(self):
    sentences1 = PlaintextParser.from_string("One, two two. Two. Three.",
                                             Tokenizer("english")).document.sentences
    self.assertEqual(["One", "two", "two", "Two", "Three"],
                     _split_into_words(sentences1))
    sentences2 = PlaintextParser.from_string("two two. Two. Three.",
                                             Tokenizer("english")).document.sentences
    self.assertEqual(["two", "two", "Two", "Three"],
                     _split_into_words(sentences2))
Example 3: _firstK_score
def _firstK_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    geneSen = parser.document.sentences[:SENTENCES_COUNT]
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences
    # print geneSen
    # print "=========="
    # print refSen
    # print evaluate(geneSen, refSen)
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        print storyName
        print e
        raise e
Example 4: test_get_word_ngrams
def test_get_word_ngrams(self):
    sentences = PlaintextParser.from_string("This is a test.",
                                            Tokenizer("english")).document.sentences
    correct_ngrams = [("This", "is"), ("is", "a"), ("a", "test")]
    found_ngrams = _get_word_ngrams(2, sentences)
    for ngram in correct_ngrams:
        self.assertTrue(ngram in found_ngrams)
Example 5: summarize
def summarize(corpus, length, algorithm):
    summarizer = None
    summary = "No compatible summarizer was selected, please use one of these: textrank, lexrank, luhn, edmundson*, kl, lsa, sumbasic, random (* doesn't work yet)"
    algorithm = algorithm.lower()
    try:
        parser = PlaintextParser.from_string(corpus, Tokenizer(LANGUAGE))
        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(LANGUAGE))
        if summarizer:
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join([obj._text for obj in summarizer(parser.document, length)])
        return summary
    except Exception as e:
        return str(e)
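Assuming the summarizer classes referenced above are imported from their sumy modules (e.g. LexRankSummarizer from sumy.summarizers.lex_rank) and LANGUAGE is defined, a hypothetical call to this dispatcher could look like this:

# Hypothetical usage of the dispatcher above; the corpus is a placeholder.
corpus = "Some longer text to be summarized. It has several sentences. Each adds information."
print(summarize(corpus, 2, "lexrank"))   # two-sentence LexRank summary
print(summarize(corpus, 2, "unknown"))   # falls through to the "No compatible summarizer" message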
Example 6: kl_rank_sum
def kl_rank_sum(path, K):
    filename = path
    K = K
    parser = PlaintextParser.from_file(filename, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, K)  # K = number of sentences in the summary
    return summary
Example 7: get_summary
def get_summary(source_text, compression_factor):
    """
    Given some input source_text, returns its summary based on the chosen
    compression factor.
    """
    summary = {
        'source_text': source_text,
        'compression_factor': compression_factor,
        'summary': '',
        'success': False
    }
    parser = PlaintextParser.from_string(source_text, Tokenizer("english"))
    summ_algo = LexRankSummarizer()
    final_line_num = int(source_text.count('.') / compression_factor)
    try:
        raw_summary = summ_algo(parser.document, final_line_num)
        for sentence in raw_summary:
            summary['summary'] += str(sentence) + ' '
    except:
        pass
    summary['success'] = (len(summary['summary']) != 0)
    return summary
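The sentence budget here is the number of periods in the source divided by compression_factor, so a factor of 3 keeps roughly one third of the sentences. A hypothetical call, with long_text standing in for real input, could be:

# Hypothetical usage; long_text is a placeholder for real input.
result = get_summary(long_text, compression_factor=3)
if result['success']:
    print(result['summary'])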
Example 8: summarize_with_info
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
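Unlike Example 5, this method returns the sentence objects themselves rather than a joined string; a hypothetical caller (with helper standing in for an instance of the class that defines this method, and document_text for real input) could join them like so:

# Hypothetical usage; "helper" and document_text are placeholders.
sentences = helper.summarize_with_info(document_text, 3, "lsa")
print(" ".join(str(s) for s in sentences))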
Example 9: summarizeFile
def summarizeFile(inputFile):
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    url = findURLS(inputFile)
    if url != None:
        if url[-1] == '.':
            url = url[0:-1]
        # print (url)
        # urlContent = 'Summary from URL ['+url+']: \n'
        urlContent = ''
        try:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            for sentence in summarizer(parser.document, 3):
                urlContent = urlContent + str(sentence) + '\n'
        except:
            # print (sys.exc_info()[0])
            urlContent = ''
    content = inputFile.read()
    parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
    # summarizer = LsaSummarizer(stem_word)
    # summarizer.stop_words = get_stop_words(LANGUAGE)
    # summary = 'Event Summary: \n'
    summary = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
            summary = summary + str(sentence) + '\n'
    except AssertionError:
        return None
    if url != None:
        return summary + urlContent
    return summary
Example 10: summarize
def summarize(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summary = " ".join([obj._text for obj in summarizer(parser.document, length)])
    return summary
Example 11: summarize
def summarize(string, summary_length=1, language="english"):
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    return ". ".join([str(sentence) for sentence in summarizer(parser.document, summary_length)])
Example 12: sumrise
def sumrise(text=text, sentences=5):
    if validators.url(text):
        text = web2text.getwebtxt(text)
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summerizer = LsaSummarizer()
    summary = str(summerizer(parser.document, sentences))
    return summary
Example 13: summarize
def summarize(self, extracted_refs, facet_results, max_length=250, mode='citance'):
    '''
    Summarizes the extracted references based on community detection

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference
            Look at data/task1b_results1.json
        max_length(int) -- maximum length of the summary
        mode(str) -- can be citance, reference
    '''
    citances = defaultdict(list)
    summarizer = LexRankSummarizer(Stemmer('english'))
    summary = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        citances[t[0]['topic']].append(
            {'refs': t[0]['sentence'],
             'citance': self.clean_citation(t[0]['citation_text'])})
    for topic, citance in citances.iteritems():
        # Create graph of citation similarities
        vectorizer = TfidfVectorizer(
            tokenizer=self.tokenize, min_df=1, max_df=len(citances) * .9)
        cit_vectors = vectorizer.fit_transform(
            [e['citance'] for e in citance]).toarray()
        cit_text = {
            i: v for i, v in enumerate(citance)}
        cit_dict = {i: v for i, v in enumerate(cit_vectors)}
        cits = []
        for e in cit_dict:  # vector (numpy array)
            for e1 in cit_dict:
                if e != e1:
                    simil = self.cossim(cit_dict[e],
                                        cit_dict[e1])
                    if simil > 0.1:
                        cits.append((e, e1, simil))
        G = nx.Graph()
        G.add_weighted_edges_from(cits)
        part = community.best_partition(G)
        clusters = defaultdict(list)
        tokenize = SentTokenizer(offsets=False)
        for k, v in part.iteritems():
            clusters[v].extend(tokenize(citance[k]['refs']))
        # clusters includes ref sentences that belong in each cluster
        # Find the most salient sentence in each cluster
        sal_in_cluster = {}  # salient sentences for each cluster
        for i in clusters:
            parser = PlaintextParser.from_string(
                ' '.join(clusters[i]).replace('\\', ''), Tokenizer('english'))
            summ = summarizer(parser.document, 5)
            # 5 is the number of sentences returned by LexRank
            sal_in_cluster[i] = [unicode(s) for s in summ]
        # The most salient sentences in each cluster
        summary[topic.upper()] = \
            self.pick_from_cluster(
                sal_in_cluster, max_length, weighted=False)
    return summary
Example 14: summarize
def summarize(self, extracted_refs, facet_results, max_length=250):
    '''
    Summarizes the extracted references based on the facet results

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference
            Look at data/task1b_results1.json
        max_length(int) -- maximum length of the summary
    '''
    summaries = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        topic = t[0]['topic']
        citance = t[0]['citance_number']
        if isinstance(t[0]['sentence'][0], list):
            logger.warn('Unexpected, should check')
        summaries[topic.upper()]\
            [facet_results[topic.upper()]
             [str(citance)]['SVM_LABEL']].append([t[0]['citation_text']])
    summarizer = TextRankSummarizer(Stemmer('english'))
    final_summ = defaultdict(lambda: defaultdict(dict))
    ret_summ = defaultdict(list)
    counts = defaultdict(lambda: defaultdict(dict))
    for t in summaries:
        for facet in summaries[t]:
            if len(summaries[t][facet]) > 1:
                summs = list(
                    itertools.chain.from_iterable(summaries[t][facet]))
                parser = PlaintextParser.from_string(
                    ' '.join(summs), Tokenizer('english'))
                summ = summarizer(parser.document, max_length)
                final_summ[t][facet] = [unicode(sent) for sent in summ]
                counts[t][facet] = len(final_summ[t][facet])
            else:
                final_summ[t][facet] = self.s_t(summaries[t][facet][0])
        i = 0
        while self.w_t.count_words(ret_summ[t]) < max_length:
            for fct in final_summ[t]:
                if i < len(final_summ[t][fct]):
                    ret_summ[t].append(final_summ[t][fct][i])
            i += 1
        while self.w_t.count_words(ret_summ[t]) > max_length:
            ret_summ[t].pop()
    # summ = defaultdict(list)
    # tokzer = WordTokenizer(stem=False)
    # for k in final_summ:
    #     i = 0
    #     while tokzer.count_words(summ[k]) < max_length:
    #         for f in final_summ[k]:
    #             if len(final_summ[k][f]) > i and\
    #                     tokzer.count_words(summ[k]) < max_length:
    #                 summ[k].append(final_summ[k][f][i])
    return ret_summ
Example 15: summarize
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence)
    return total