本文整理匯總了Python中readability.Document.options['debug']方法的典型用法代碼示例。如果您正苦於以下問題:Python Document.options['debug']方法的具體用法?Python Document.options['debug']怎麼用?Python Document.options['debug']使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類readability.Document的用法示例。
在下文中一共展示了Document.options['debug']方法的1個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: parseitem
# 需要導入模塊: from readability import Document [as 別名]
# 注意: options['debug'] 無法直接導入; 它是 Document 實例上的選項, 需先建立實例再設定, 例如 doc = Document(html); doc.options['debug'] = False
def parseitem(self,response):
':type response: Response'
if 'Please turn on JavaScript' in response.body:
body = response.body
body = re.sub('<p class="caption"[^<]+', '', body)
body = re.sub('<noscript>(.|\r|\n)*?</noscript>','',body)
response = response.replace(body=body)
sel = Selector(response)
item = NewsscraperItem()
### storing the name of URL and source in item dictionary
item['url']= response.url
item['source']= self.name
### extracting the time of scraping of data inside the item
item['dateScraped']= strftime("%Y-%m-%d %H:%M:%S", gmtime())
### checking for url category defined in allowed domain. If response.url contain the string then move in otherwise in else condition
try:
if 'www.bbc.co.uk' in response.url:
### extracting title
title = sel.xpath("//h1[starts-with(@class,'story')]/text()").extract()
if(title):
### extracting title from the page and checking different xpath for searching the title
item['title']=title[0].strip()
### extracting date from the page using xpath
d = sel.xpath("//span[@class='date']/text()").extract()[0].strip()
###string to datetime conversion
f = strptime(d,'%d %B %Y')
###formating date in a particular format defined in config file
item['date']= strftime(Config['dateformat'],f)
### extracting content from the page
x = sel.xpath("(//div[@class='story-body']//*[self::p or self::strong]/text()) |(//span[@class='cross-head']/text())|(//div[@class='story-body']/p/a/text())").extract()
if len(x) > 1:
st="\n"
p = st.join(x)
### using regular expression to remove continuous white spaces from the content and replace by single space
item['content']= re.sub(r"[ \t\n]+", " ",p)
else:
# Not able to extract article content using xpath. Move to backup approach and use readability
try:
html = sel.xpath("//div[@class = 'story-body']").extract()
doc = Document(html)
doc.options['debug'] = False
try:
logging.basicConfig(level=logging.CRITICAL)
htmlContent = doc.summary()
content = html2texthandler(htmlContent)
except Exception, e:
pass
finally:
logging.basicConfig(level=logging.INFO)
item['content']= re.sub(r"[ \t\n\"]", " ",content)
except:
return