本文整理汇总了Python中readability.readability.Document.lower方法的典型用法代码示例。如果您正苦于以下问题:Python Document.lower方法的具体用法?Python Document.lower怎么用?Python Document.lower使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类readability.readability.Document
的用法示例。
在下文中一共展示了Document.lower方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: processHtml
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import lower [as 别名]
def processHtml(self, html):
    """Score an HTTP reply's page for brand/keyword relevance.

    Splits the raw reply into header and body, extracts the readable
    article text and title, and — when the page is relevant — folds a
    weighted per-brand share into ``self.brand_score``.

    :param html: raw HTTP reply string (headers + body, CRLF separated)
    :return: ``True`` when the page was relevant and scored, else ``False``
    """
    # Split off the HTTP header. maxsplit=1 keeps the WHOLE body even when
    # it contains further blank lines (a bare split()[1] silently truncates
    # the body at the next \r\n\r\n).
    parts = html.split("\r\n\r\n", 1)
    if len(parts) < 2:
        # Malformed reply with no header/body separator.
        return False
    html_body = parts[1]
    # html_body = html_body.decode('gbk', 'ignore').encode('utf-8')
    # Parse once and reuse for both the readable article and the title
    # (the original built two Document objects from the same input).
    doc = Document(html_body)
    article = doc.summary()
    # Record the title to judge relevance below.
    title = doc.title()
    # Extract plain text from the readable article markup.
    soup = BeautifulSoup(article, "html.parser")
    text = soup.getText()
    # Ignore documents too small to score meaningfully.
    if len(text) < WebCrawler.PAGE_LENGTH_LIMIT:
        return False
    # Lower-case and tokenize the page content.
    tokens = nltk.word_tokenize(text.lower())
    # Build the stop-word set once: stopwords.words() returns a list, so
    # testing membership per token was O(len(stopwords)) each time.
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens if w not in stop_words]
    # The title must mention at least one tracked brand.
    title_lower = title.lower()
    if not any(word in title_lower for word in self.brandList):
        return False
    '''
    tunable values
    '''
    # Compute document relevance from brand/keyword hits.
    score = 0
    for word in filtered:
        if word in WebCrawler.brandList:
            score += 50
        if word in WebCrawler.keywordList:
            score += 20
    print("Score: " + str(score))
    # Map the raw score onto a coarse page-relevance weight.
    if score < 100:
        return False
    elif score < 200:
        weight = 100
    elif score < 400:
        weight = 200
    elif score < 800:
        weight = 300
    elif score < 1600:
        weight = 400
    else:
        weight = 500
    print("Page Relevance: " + str(weight))
    '''
    end
    '''
    # Number of times (as a percentage) each brand appears in the page,
    # format: <brand name> : percentage. Substring match per token.
    brand_dict = {}
    total_count = 0
    for brand in WebCrawler.brandList:
        brand_count = sum(1 for word in filtered if brand in word)
        brand_dict[brand] = brand_count
        total_count += brand_count
    if total_count == 0:
        # Relevant by score/title but no substring brand hits: nothing to
        # distribute (the original divided by zero here).
        return True
    # Convert raw counts to a share of all brand mentions.
    for brand, count in brand_dict.items():
        brand_dict[brand] = float(count) / total_count
    # Fold each brand's weighted share into the running country score.
    for brand, percentage in brand_dict.items():
        if brand in self.brand_score:
            self.brand_score[brand] += weight * percentage
        else:
            self.brand_score[brand] = weight * percentage
    return True
示例2: Summarize
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import lower [as 别名]
class Summarize(object):
    """Frequency-based extractive summarizer for web pages or raw text.

    Selects sentences that contain all of the document's most frequent
    words, recursively tuning the frequency cutoff until the summary
    compresses the source by roughly 60-80%.
    """

    def __init__(self):
        # After summarization: list of (word, count) pairs, most frequent
        # first (starts as an empty dict before the first run).
        self.freq = {}
        # Cleaned candidate sentences extracted from the document.
        self.sentences = []
        # Lower-cased, punctuation-stripped document text.
        self.data = ''
        # Recursion budget for evaluate()'s threshold search.
        self.maxRec = 500

    def checkSentence(self, s, x):
        """Return True when token list *s* has at most 50 tokens and
        contains every one of the *x* most frequent words."""
        if len(s) > 50:
            return False
        for word in self.freq[:x]:
            # freq holds (word, count) pairs; word[0] is the token.
            if word[0] not in s:
                return False
        return True

    @staticmethod
    def _clean(markup):
        """Strip HTML tags and flatten punctuation/whitespace to spaces.

        Shared by summarize()/summarizeText(), which previously duplicated
        an 11-call .replace() chain. Apostrophes are removed outright; the
        remaining punctuation becomes spaces, matching the original chain.
        """
        text = MLStripper.strip_tags(markup)
        text = text.replace("'", "")
        for ch in '\n,\t"():][;':
            text = text.replace(ch, ' ')
        return text

    def _buildFreq(self, words):
        """Count *words* and store (word, count) pairs, most frequent first.

        Uses .items() (valid on both Python 2 and 3) where the original
        used the Python-2-only iteritems().
        """
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        self.freq = sorted(counts.items(), key=operator.itemgetter(1),
                           reverse=True)

    def _buildSentences(self):
        """Split self.data on '.' into normalized candidate sentences.

        Resets self.sentences so repeated calls do not accumulate
        sentences from earlier documents (the original appended forever).
        """
        self.sentences = []
        for part in self.data.split('.'):
            self.sentences.append(' '.join(re.findall(r'([a-z]+|\d+)+', part)))

    def summarize(self, url):
        """Fetch *url* and return {'title': ..., 'summary': ...}."""
        self.data = urllib2.urlopen(url).read()
        self.data = Document(self.data).summary()
        self.data = self._clean(self.data).lower()
        self._buildSentences()
        self._buildFreq(re.findall(r'([a-z]+|\d+)+', self.data))
        t = lxml.html.parse(url)
        title = t.find(".//title").text
        return {'title': title, 'summary': self.evaluate(0.01)}

    def summarizeText(self, text):
        """Summarize raw (possibly HTML) *text* and return the summary.

        BUG FIX: the original iterated the raw string (`for word in text`),
        building per-CHARACTER frequencies; it now tokenizes into words
        exactly like summarize() does.
        """
        self.data = self._clean(text).lower()
        self._buildSentences()
        self._buildFreq(re.findall(r'([a-z]+|\d+)+', self.data))
        return self.evaluate(0.01)

    def evaluate(self, d):
        """Build a summary from sentences containing all of the top
        floor(len(freq)*d) words, recursively adjusting *d* until the
        compression ratio lands in (0.60, 0.80) or the budget runs out."""
        self.maxRec -= 1
        output = ''
        num = int(math.floor(len(self.freq) * d))
        for sentence in self.sentences:
            tokens = re.findall(r'[a-z]+', sentence)
            if self.checkSentence(tokens, num) and len(sentence) > 2:
                # Re-capitalize the first letter; sentences were lower-cased.
                output += sentence[0].upper() + sentence[1:] + '. '
        if len(self.data) > 0:
            # Force float division: under Python 2 (this code uses urllib2)
            # the original integer division made compression 0 or 1, so the
            # tuning recursion always ran to exhaustion.
            compression = 1 - (float(len(output)) / len(self.data))
            if self.maxRec > 0:
                if compression >= 0.80:
                    return self.evaluate(d - .001)
                if compression <= 0.60:
                    return self.evaluate(d + .001)
        return output
#s = Summarize()
#print s.summarize('http://www.bbc.co.uk/news/uk-25996176')