This article collects typical usage examples of the Python readability.readability.Document class. If you are wondering how to use the Document class in Python, how it is called in practice, or what real-world code that uses it looks like, the curated examples below should help.
Fifteen code examples of the Document class are shown below, sorted by popularity by default.
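Before the individual examples, here is a minimal sketch of the pattern most of them share: fetch the raw HTML, hand it to Document, and read back the cleaned title and article body. This sketch is not taken from any one example below; it assumes Python 3 with the requests library and the readability package (commonly distributed as readability-lxml) installed.

import requests
from readability.readability import Document

def extract(url):
    # download the raw page HTML
    html = requests.get(url).text
    # let readability isolate the main article
    doc = Document(html)
    title = doc.short_title()       # cleaned page title
    article_html = doc.summary()    # main content as an HTML fragment
    return title, article_html

# hypothetical usage
if __name__ == '__main__':
    title, body = extract('https://example.com/some-article')
    print(title)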
Example 1: run
def run(index):
    print "Index %d" % index
    dirname = "data/%04d" % index
    # url of english article
    url = open(dirname + "/url_en.txt").read()
    # download html
    html = urllib.urlopen(url).read().decode('latin-1')
    # apply readability
    document = Document(html)
    article = document.summary()
    article = nltk.clean_html(article)
    # replace latin characters
    article = re.sub(u' ', u'\n', article)
    article = re.sub(u'\x92', u'`', article)
    article = re.sub(u'\x96', u'-', article)
    # article_en.txt
    output = codecs.open(dirname + "/article_en.txt", 'w', encoding='ascii', errors='ignore')
    output.write(article)
    output.close()
    # title.txt
    output = codecs.open(dirname + "/title.txt", 'w', encoding='ascii', errors='ignore')
    output.write(document.title())
    output.close()
Example 2: recommend_by_url
def recommend_by_url(url):
    parsed = urlparse(url)
    doc = Document(requests.get(url).content)
    content = html.fromstring(doc.content()).xpath('string()')
    bigrams = make_bigrams(content)
    vec_bow = dictionary.doc2bow(bigrams)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    #print sims
    docs = sorted(list(enumerate(sims)), key=lambda item: -item[1])
    results, seen = [], []
    for doc, score in docs:
        res = ARTICLES[doc]
        if 'url' not in res or res['url'] in seen:
            continue
        seen.append(res['url'])
        p = urlparse(res['url'])
        if p.hostname.endswith(parsed.hostname):
            continue
        res['score'] = float(score)
        if 'content' in res:
            del res['content']
        if 'html' in res:
            del res['html']
        if res['summary']:
            res['summary'] = res['summary'].strip()
        results.append(res)
        if len(results) > 14:
            break
    return results
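Example 2 relies on module-level objects (dictionary, lsi, index, ARTICLES, make_bigrams) that are not shown. As a rough, hypothetical sketch of how such structures are typically built with the standard gensim API (the real project may differ):

from gensim import corpora, models, similarities

# texts: one token list (e.g. bigrams) per article in ARTICLES (assumed structure)
texts = [make_bigrams(article['content']) for article in ARTICLES]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# build an LSI model and a similarity index over the same corpus
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)
index = similarities.MatrixSimilarity(lsi[corpus])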
Example 3: markdownify
def markdownify(url_list, **options):
    articles = []
    images = []
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']
    for url in url_list:
        req = urllib2.Request(url, None, {'Referer': url_list[0]})
        html = urllib2.urlopen(req).read()
        document = Document(html, url=url)
        readable_title = document.short_title()
        summary = document.summary()
        summary_doc = build_doc(summary)
        images.extend([a.get('src') for a in summary_doc.findall('.//img')])
        articles.append(document.summary())
    markdown_articles = []
    for (article, url) in zip(articles, url_list):
        h = html2text.HTML2Text(baseurl=url)
        h.inline_links = False
        h.links_each_paragraph = (paragraph_links and 1) or 0
        h.body_width = (wrap_text and 78) or 0
        markdown_articles.append(h.handle(article))
    combined_article = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        combined_article = (u"Title: %s \nOriginal URL: %s\n\n" % (readable_title, url_list[0])) + combined_article
    return combined_article.encode("utf-8")
Example 4: getText
def getText():
    dataList = []
    for f in os.listdir('unsupervised\\documents'):
        filePath = 'unsupervised\\documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text) #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
                article = Document(source).summary()
                title = Document(source).title()
                soup = BeautifulSoup(article, 'lxml')
                final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
                dataList.append(final)
            #print '*** TITLE *** \n\"' + title + '\"\n'
            #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undectected document type'
        print '' #"-------------------------------"
    return dataList
Example 5: extract_article
def extract_article(url):
    r = requests.get(url)
    # if the url exists, continue
    if r.status_code == 200:
        # extract and parse response url
        url = parse_url(r.url)
        # extract html
        html = r.content.decode('utf-8', errors='ignore')
        # run boilerpipe
        # boilerpipe_extractor = Extractor(html=html)
        # run readability
        readability_extractor = Document(html)
        html = readability_extractor.summary()
        # return article data
        return {
            'title': readability_extractor.short_title(),
            'html': html,
            'content': strip_tags(html).encode('utf-8', errors='ignore'),
            'url': url
        }
    # otherwise return an empty dict
    else:
        return {}
Example 6: get_webpage_by_html
def get_webpage_by_html(url, html=None):
    html = get_html_str(url, html)
    summary_obj = predefined_site(url, html)
    article = video_site(url)
    if summary_obj is None:
        doc = Document(html, url=url, debug=True, multipage=False)
        summary_obj = doc.summary_with_metadata(enclose_with_html_tag=False)
    title = summary_obj.short_title
    if article is None:
        article = summary_obj.html
    from urllib.parse import urlparse
    webpage = Webpage()
    webpage.url = url
    webpage.domain = urlparse(url).hostname
    webpage.title = title
    webpage.favicon = ""
    webpage.top_image = None
    webpage.excerpt = summary_obj.description
    webpage.author = None
    webpage.content = article
    webpage.tags = get_suggest_tags(title, article, summary_obj.keywords)
    webpage.movies = []
    webpage.raw_html = html
    webpage.publish_date = None
    webpage.segmentation = get_segmentation(title, article)
    return webpage.__dict__
Example 7: extract_article
def extract_article(url):
    r = requests.get(url)
    # if the url exists, continue
    if r.status_code == 200:
        # extract and parse response url
        url = parse_url(r.url)
        # extract html
        html = r.content.decode('utf-8', errors='ignore')
        # run boilerpipe
        BP = Extractor(html=html)
        # run readability
        Rdb = Document(html)
        html = Rdb.summary()
        # return article data
        return {
            'extracted_title': Rdb.short_title().strip(),
            'extracted_content': strip_tags(BP.getText()),
        }
    # otherwise return an empty dict
    else:
        return {}
Example 8: set
class Gist:
    keyword_pattern = re.compile(r'^[^\d]+$')
    stop_words = set(get_stop_words('en'))

    def __init__(self, html):
        self.html = html
        self.document = Document(html)

    @property
    def title(self):
        return self.document.short_title()

    @cached_property
    def text(self):
        text = self.document.summary()
        text = re.sub('<br[^>]+>', '\n', text)
        text = re.sub('</?p[^>]+>', '\n\n', text)
        text = re.sub('<[^>]+>', '', text)
        text = re.sub('^[ \t]+$', '', text)
        text = re.sub('\n{3,}', '\n\n', text, flags=re.MULTILINE)
        return text

    @staticmethod
    def _common_prefix(one, two):
        parallelity = [x == y for x, y in zip(one, two)] + [False]
        return parallelity.index(False)

    @classmethod
    def _find_representative(cls, stem, text):
        tokens = text.split()
        prefixes = {token: cls._common_prefix(token, stem) for token in tokens}
        best = lambda token: (-token[1], len(token[0]))
        return sorted(prefixes.items(), key=best)[0][0]

    @classmethod
    def _is_good_keyword(cls, word):
        return (word not in cls.stop_words) and \
            cls.keyword_pattern.match(word)

    @classmethod
    def find_keywords(cls, text):
        whoosh_backend = SearchForm().searchqueryset.query.backend
        if not whoosh_backend.setup_complete:
            whoosh_backend.setup()
        with whoosh_backend.index.searcher() as searcher:
            keywords = searcher.key_terms_from_text(
                'text', text, numterms=10, normalize=False)
        keywords = list(zip(*keywords))[0] if keywords else []
        keywords = [cls._find_representative(keyword, text) for keyword in keywords]
        keywords = [keyword for keyword in keywords if cls._is_good_keyword(keyword)]
        # no double keywords in list
        keywords = list(set(keywords))
        # no punctuation in suggested keywords
        keywords = [''.join(c for c in s if c not in string.punctuation) for s in keywords]
        return keywords

    @property
    def keywords(self):
        return self.find_keywords(self.text)
Example 9: enrich
async def enrich(self, result):
    if not self.soup:
        return result
    result.set('title', self.soup.title.string, 0, 'textlength')
    if result.has('content'):
        return result
    parts = []
    for txt in self.soup.find_all("noscript"):
        if txt.string is not None:
            parts.append(txt.string)
    html = " ".join(parts).strip()
    if not html:
        html = self.soup.all_text()
    try:
        doc = Document(html, url=self.url)
        content = doc.summary(html_partial=True)
        result.set('content', sanitize_html(content))
    # pylint: disable=bare-except
    except:
        pass
    return result
Example 10: __init__
class Article:
    def __init__(self, url):
        print('Saving page: {}'.format(url))
        res = requests.get(url)
        self.url = url
        self.article = Document(res.content)
        self._add_title()
        self._save_images()

    def _add_title(self):
        self.root = etree.fromstring(self.article.summary())
        body = self.root.find('body')
        title = self.article.title()
        ascii_title = unidecode(title) if type(title) == unicode else title
        title_header = etree.HTML('<h2>{}</h2>'.format(ascii_title))
        body.insert(0, title_header)

    def _save_images(self):
        tmppath = tempfile.mkdtemp()
        images = self.root.xpath('//img')
        for img in images:
            imgsrc = img.get('src')
            # handle scheme-agnostic URLs
            if 'http' not in imgsrc and '//' in imgsrc:
                imgsrc = 'http:{}'.format(imgsrc)
            # handle relative file paths
            elif 'http' not in imgsrc:
                parsed = urlparse(self.url)
                imgsrc = '{}://{}{}'.format(parsed.scheme, parsed.netloc, imgsrc)
            filename = os.path.basename(imgsrc)
            dest = os.path.join(tmppath, filename)
            try:
                res = requests.get(imgsrc)
            except Exception as e:
                print('Could not fetch image ({}) from "{}"'.format(str(e), imgsrc))
                return
            if res.status_code == 404:
                print('Could not fetch image (HTTP 404), attempted fetch: "{}", source URL: {}'.format(imgsrc, img.get('src')))
                continue
            with open(dest, 'wb') as f:
                f.write(res.content)
            img.set('src', dest)

    @property
    def title(self):
        return self.article.title()

    @property
    def html(self):
        return etree.tostring(self.root)
Example 11: get_announcement_body
def get_announcement_body(url):
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()
    readable_announcement = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_announcement, "lxml")
    final_announcement = soup.text
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        images.append(li)
    resp[0] = str(final_announcement.encode("ascii", "ignore"))
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = str(now.month)+" "+str(now.day)+" "+str(now.year)+"-"+str(now.hour)+":"+str(now.minute)+":"+str(now.second)
    resp[3] = url
    resp[4] = url
    resp[5] = ""
    #insertDB(resp)
    #print "inserted resp"
    title_article = []
    title_article.append(final_announcement)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
Example 12: getTextFromHTML
def getTextFromHTML(self, url_id):
    """ Runs Readability (Document) on the HTML text
    """
    html_row = get_html(self.pg_conn, url_id)
    if not html_row or 'html' not in html_row:
        return False
    if html_row['readabletext'] and html_row['readabletext'] != '':
        return html_row['readabletext']
    html = html_row['html']
    try:
        html_summary = Document(html).summary(html_partial=True)
        html_summary = html_summary.replace('\n', '').replace('\t', '')
        if len(html_summary) < 150 or "Something's wrong here..." in html_summary or "<h1>Not Found</h1><p>The requested URL" in html_summary or html_summary == "<html><head/></html>" or "403 Forbidden" in html_summary:
            return False
        raw_text = lxml.html.document_fromstring(html_summary).text_content()
    except:
        raw_text = False
    if raw_text:
        save_readabletext(self.pg_conn, url_id, raw_text, 'meta')
    else:
        save_readabletext(self.pg_conn, url_id, '', 'meta')
    return raw_text
Example 13: main
def main():
    #print 'Hello there'
    # Command line args are in sys.argv[1], sys.argv[2] ...
    # sys.argv[0] is the script name itself and can be ignored
    dataList = []
    for f in os.listdir('documents'):
        filePath = 'documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text) #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            # with open(filePath) as f:
            #     doc = slate.PDF(f)
            #     print doc[1]
            #     exit()
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
                article = Document(source).summary()
                title = Document(source).title()
                soup = BeautifulSoup(article, 'lxml')
                final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
                dataList.append(final)
            #print '*** TITLE *** \n\"' + title + '\"\n'
            #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undectected document type'
        print '' #"-------------------------------"
    #print dataList
    #for i in dataList:
    #    print i
    cachedStopWords = stopwords.words("english")
    combined = ' '.join(dataList)
    #print combined
    bloblist = [tb(combined)]
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist) for word in blob.words if word not in nltk.corpus.stopwords.words('english')}
        #print scores
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        #print sorted_words
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Example 14: _getResponseText
def _getResponseText(self, response):
    '''
    (response) -> Text

    Returns text within the body of an HttpResponse object.
    '''
    readability = Document(response.body)
    content = readability.title() + readability.summary()
    return content
Example 15: main
def main():
    html = urllib.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    f = open("C:\\users\\mykola\\documents\\%s.html" % short_title, "wb")
    f.write(readable_article.encode("utf-8"))
    f.close()
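Several of the examples above are Python 2 code (print statements, urllib.urlopen, urllib2). For readers on Python 3, a rough equivalent of this last example might look like the following sketch (assuming urllib.request and the same readability Document API; the output path is illustrative only):

import urllib.request
from readability.readability import Document

def main():
    html = urllib.request.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    # write the extracted article to an HTML file named after its title
    with open("%s.html" % short_title, "wb") as f:
        f.write(readable_article.encode("utf-8"))

if __name__ == '__main__':
    main()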