This article collects typical usage examples of the Document.split method from the Python readability.readability module. If you have been wondering what Document.split does, how it is used, or where to find examples of it, the curated code samples below may help. You can also read further about the readability.readability.Document class it belongs to.
Six code examples of Document.split are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
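Note that in every example below, split is actually called on the plain strings returned by Document methods such as short_title() and summary(), not on the Document object itself. A minimal sketch of the shared pattern (assuming the readability-lxml package is installed; 'page.html' is an illustrative file name):

from readability.readability import Document

html = open('page.html').read()
doc = Document(html)
# short_title() returns a plain string, so str.split applies:
title = doc.short_title().split('-')[0].strip()
# summary() returns the cleaned-up article HTML as a string:
lines = doc.summary().split('\n')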
Example 1: download_html_as_text
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import split [as alias]
# This snippet also assumes (Python 2): urllib2, re, pypandoc; patch_image_alt
# and download_images are helpers defined elsewhere in the original project.
def download_html_as_text(url, filename=None, format_to='rst'):
    """Download HTML content from url and convert it to plain text."""
    # Open the connection (Python 2: urllib2).
    headers = {'User-Agent': 'Mozilla Firefox for Ubuntu canonical - 1.0'}
    req = urllib2.Request(url, headers=headers)
    con = urllib2.urlopen(req)
    html = con.read()
    # Extract the main content; fall back to the raw page if it looks too short.
    article = Document(html).summary()
    if len(article) < 1024:
        article = html
    article = patch_image_alt(article)
    title = Document(html).short_title()
    text = pypandoc.convert(article, format_to, format='html')
    # Build a reStructuredText title block and a :URL: field.
    title_utf8 = title.encode('utf-8')
    lines_insert = [u'\n\n',
                    u'=' * len(title_utf8), u'\n',
                    title_utf8, u'\n',
                    u'=' * len(title_utf8), u'\n\n',
                    u':URL: ' + url, u'\n\n']
    # Keep only the part of the title before a '|', ',' or '-' separator.
    title = re.split(r'[|,-]', title)[0]
    # Search for URLs of images in the converted text.
    imgurl_pattern = r'\.\.\s+\|([^|]+)\|\s+image::\s+(https?://\S+)'
    imgurl_re = re.compile(imgurl_pattern, re.I)
    image_urls = imgurl_re.findall(text)
    if filename is None:
        filename = title.split('-')[0].strip().replace(' ', '-')
    # Write a backup copy with the remote image URLs still in place.
    txtfile = open(filename + '-bak.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
    # Replace online image URLs with local paths.
    images = download_images(image_urls, filename + '-images')
    for img, link in images:
        text = text.replace(link, img)
    txtfile = open(filename + '.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
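A hypothetical call (the URL is illustrative) would write article-bak.rst with the remote image links intact, then article.rst with the links rewritten to local paths:

download_html_as_text('http://example.com/post.html', filename='article')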
Example 2: _filter
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import split [as alias]
# This snippet also assumes bs4 (UnicodeDammit, BeautifulSoup); clean_soup and
# clean_spaces are project helpers; hypothetical versions are sketched after
# the example.
def _filter(self, html):
    # Normalize raw bytes to unicode before parsing.
    unicode_html = UnicodeDammit(html, is_html=True).unicode_markup
    text = Document(unicode_html).summary()
    soup = BeautifulSoup(text, 'lxml')
    text = clean_soup(soup).get_text()
    if self.min_len > 0:
        # Keep only paragraphs longer than min_len characters.
        paragraphs = text.split('\n')
        paragraphs = [par for par in map(clean_spaces, paragraphs)
                      if len(par) > self.min_len]
        return self.delimiter.join(paragraphs)
    else:
        return clean_spaces(text)
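The clean_soup and clean_spaces helpers are not part of this snippet; minimal hypothetical versions, matching how they are called above, might look like:

import re

def clean_soup(soup):
    # Drop script/style elements before extracting visible text.
    for tag in soup(['script', 'style']):
        tag.decompose()
    return soup

def clean_spaces(text):
    # Collapse runs of whitespace into single spaces.
    return re.sub(r'\s+', ' ', text).strip()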
Example 3: process_item
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import split [as alias]
# This Scrapy pipeline snippet also assumes scrapy's HtmlXPathSelector and the
# html2text package.
def process_item(self, item, spider):
    if self.notThisPipeline(spider):
        return item
    hxs = HtmlXPathSelector(text=item["raw"])
    image = hxs.select("//*[contains(@id, 'cardpic0')]//a//img/@src").extract()
    if len(image) == 0:
        image = ""
    else:
        image = image[0]
    #image_local = image_path + image[0][-20:]
    #f = open(image_local, 'w')
    #data = ul.urlopen(image).read()
    #f.write(data)
    item['image'] = image
    # Extract the main article body and convert it to plain text.
    article = Document(item['raw']).summary()
    item['article'] = html2text.html2text(article)
    # Page titles are usually "headline_site"; keep only the headline.
    title = Document(item['raw']).short_title()
    title = title.split('_')
    item['title'] = title[0]
    return item
Example 4: parser_content
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import split [as alias]
# This snippet also assumes (Python 2): urllib, re, and
# from lxml.html import soupparser.
def parser_content(url):
    rt_result = []
    # Regex that strips any remaining HTML tags from a line.
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary().encode('utf8')
    #print readable_article
    # Strip spaces from the extracted HTML before splitting into lines.
    readable_article = readable_article.replace(' ', '')
    cur_list = readable_article.split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            #print item.split('src=')[1].split('"')[1]
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    # Image lines are tagged '0' and carry the src URL.
                    rt_result.append(['0', img.get('src')])
        else:
            # Text lines are tagged '1'; keep only lines longer than 10 chars.
            use_item = dr.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                rt_result.append(['1', use_item])
    return rt_result
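Called on a page (the URL is illustrative), parser_content returns ['0', image_url] and ['1', text] pairs in document order:

for tag, value in parser_content('http://example.com/post.html'):
    print tag, value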
Example 5: title
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import split [as alias]
# This fragment also assumes (Python 2): feedparser, urllib, string,
# from datetime import datetime, from time import mktime; words_edited is a
# common-word list built earlier in the original script.
words_edited.extend(["BBC", "England", "Britain", "2012", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])
s = set(words_edited)
# Call and parse the BBC News feed into a dict.
bbc = feedparser.parse("http://feeds.bbci.co.uk/news/uk/rss.xml")
entries_parsed = []
# Go through each entry, use the readability module to extract the article
# title (unfortunately not the same as the feed title) and use NLTK to parse
# it for uncommon words and names. Build an array of these keywords.
for entry in bbc.entries:
    entry_parsed = []
    html = urllib.urlopen(entry.link).read()
    article_title = Document(html).short_title()
    if "404" not in article_title:
        entry_parsed.append(article_title)
        # Keep only title words that are not in the common-word set s.
        nltk_title = filter(lambda w: w.lower() not in s, article_title.split())
        processed_title = []
        for word in nltk_title:
            try:
                encoded_word = word.encode()
                # Python 2 str.translate: delete all punctuation characters.
                processed_word = encoded_word.translate(None, string.punctuation)
                if len(processed_word) > 1:
                    processed_title.append(processed_word)
            except Exception, e:
                print str(e)
        entry_parsed.append(processed_title)
        dt = datetime.fromtimestamp(mktime(entry.published_parsed))
        entry_parsed.append(dt)
        entry_parsed.append(entry.link.encode())
        entries_parsed.append(entry_parsed)
Example 6: Summarize
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import split [as alias]
# This class also assumes (Python 2): urllib2, re, math, operator, lxml.html;
# MLStripper is a project helper; a hypothetical sketch follows the example.
class Summarize(object):
    """Frequency-based extractive summarizer built on readability's Document."""
    def __init__(self):
        self.freq = {}
        self.sentences = []
        self.data = ''
        self.maxRec = 500  # recursion budget for evaluate()

    def checkSentence(self, s, x):
        # A sentence qualifies if it has at most 50 words and contains
        # every one of the x most frequent words.
        if len(s) > 50:
            return False
        for word in self.freq[:x]:
            if not (word[0] in s):
                return False
        return True

    def summarize(self, url):
        self.data = urllib2.urlopen(url).read()
        self.data = Document(self.data).summary()
        # Strip tags, then punctuation that would confuse sentence splitting.
        self.data = MLStripper.strip_tags(self.data)
        for ch in ('\n', ',', '\t', '"', '(', ')', ':', ']', '[', ';'):
            self.data = self.data.replace(ch, ' ')
        self.data = self.data.replace("'", "")
        self.data = self.data.lower()
        temp = self.data.split('.')
        text = re.findall(r'[a-z]+|\d+', self.data)
        for t in temp:
            self.sentences += [' '.join(re.findall(r'[a-z]+|\d+', t))]
        # Count word frequencies, then sort from most to least frequent.
        self.freq = {}
        for word in text:
            if word in self.freq:
                self.freq[word] += 1
            else:
                self.freq[word] = 1
        self.freq = sorted(self.freq.iteritems(), key=operator.itemgetter(1))
        self.freq.reverse()
        t = lxml.html.parse(url)
        title = t.find(".//title").text
        return {'title': title, 'summary': self.evaluate(0.01)}

    def summarizeText(self, text):
        self.data = MLStripper.strip_tags(text)
        for ch in ('\n', ',', '\t', '"', '(', ')', ':', ']', '[', ';'):
            self.data = self.data.replace(ch, ' ')
        self.data = self.data.replace("'", "")
        self.data = self.data.lower()
        temp = self.data.split('.')
        for t in temp:
            self.sentences += [' '.join(re.findall(r'[a-z]+|\d+', t))]
        # Tokenize the cleaned text into words for frequency counting.
        words = re.findall(r'[a-z]+|\d+', self.data)
        self.freq = {}
        for word in words:
            if word in self.freq:
                self.freq[word] += 1
            else:
                self.freq[word] = 1
        self.freq = sorted(self.freq.iteritems(), key=operator.itemgetter(1))
        self.freq.reverse()
        return self.evaluate(0.01)

    def evaluate(self, d):
        self.maxRec -= 1
        output = ''
        # Use the top d-fraction of the frequency list as required words.
        num = len(self.freq)
        num = int(math.floor(num * d))
        for sentence in self.sentences:
            s = re.findall(r'[a-z]+', sentence)
            if self.checkSentence(s, num) and len(sentence) > 2:
                output += sentence[0].upper() + sentence[1:] + '. '
        if len(self.data) > 0:
            # float() guards against Python 2 integer division.
            compression = 1 - (float(len(output)) / len(self.data))
            if self.maxRec > 0:
                # Tune d until the summary is 20-40% of the original length.
                if compression >= 0.80:
                    return self.evaluate(d - .001)
                if compression <= 0.60:
                    return self.evaluate(d + .001)
        return output
#s = Summarize()
#print s.summarize('http://www.bbc.co.uk/news/uk-25996176')
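MLStripper is not included in the example; a minimal Python 2 sketch based on the well-known HTMLParser tag-stripping recipe (an assumption, since the original helper is not shown) could be:

from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []

    def handle_data(self, d):
        # Collect the text content between tags.
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)

    @staticmethod
    def strip_tags(html):
        # Matches the MLStripper.strip_tags(...) calls used in Example 6.
        s = MLStripper()
        s.feed(html)
        return s.get_data()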