当前位置: 首页>>代码示例>>Python>>正文


Python Document.lower方法代码示例

本文整理汇总了Python中readability.readability.Document.lower方法的典型用法代码示例。如果您正苦于以下问题:Python Document.lower方法的具体用法?Python Document.lower怎么用?Python Document.lower使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在readability.readability.Document的用法示例。


在下文中一共展示了Document.lower方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: processHtml

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import lower [as 别名]
    def processHtml(self, html):
        """Parse a raw HTTP response, decide whether the page is relevant to
        the tracked brands, and accumulate per-brand weighted scores into
        ``self.brand_score``.

        :param html: full HTTP response text (headers + body, CRLF-delimited)
        :return: True if the page contributed to the brand scores, else False
        """
        # Split into header & body fields; ignore the header of the reply.
        html_body = html.split("\r\n\r\n")[1]
        # html_body = html_body.decode('gbk', 'ignore').encode('utf-8')
        # Build the readability Document once and reuse it for both the
        # readable article text and the title (was constructed twice).
        doc = Document(html_body)
        article = doc.summary()
        # record title to judge relevance
        title = doc.title()
        # parse content of the web page
        soup = BeautifulSoup(article, "html.parser")
        text = soup.getText()
        # print "===TITLE===" + title + "\n===CONTENT===" + text

        # ignore small document
        if len(text) < WebCrawler.PAGE_LENGTH_LIMIT:
            return False

        # Convert all words to lower case & tokenize the web page content.
        # Build the stop-word set ONCE: the original re-created the whole
        # stop-word list for every token (O(tokens * stopwords)).
        stop_words = set(stopwords.words('english'))
        tokens = nltk.word_tokenize(text.lower())
        # remove all stop words for accuracy
        filtered = [w for w in tokens if w not in stop_words]

        # check if the title contains keywords (substring match on the title)
        title_lower = title.lower()
        is_title_relevant = any(word in title_lower for word in self.brandList)
        if not is_title_relevant:
            return False

        '''
        tunable values
        '''
        # compute document relevance
        score = 0
        for word in filtered:
            if word in WebCrawler.brandList:
                score += 50
            if word in WebCrawler.keywordList:
                score += 20
        print("Score: " + str(score))

        # assign weight to document based on score
        if score < 100:
            return False
        elif score < 200:
            weight = 100
        elif score < 400:
            weight = 200
        elif score < 800:
            weight = 300
        elif score < 1600:
            weight = 400
        else:
            weight = 500
        print("Page Relevance: " + str(weight))
        '''
        end
        '''
        # number of times(percentage) a brand appear in the relevant web page, format: <brand name> : percentage
        brand_dict = {}
        total_count = 0
        # compute brand % over all brand counts (substring match per token)
        for brand in WebCrawler.brandList:
            brand_count = sum(1 for word in filtered if brand in word)
            brand_dict[brand] = brand_count
            total_count += brand_count

        # BUG FIX: the title check matches the raw title string while the
        # counts above use tokenized body text, so total_count can be 0;
        # the original divided by it unconditionally (ZeroDivisionError).
        if total_count == 0:
            return False

        # convert occurrence to % format
        for brand, count in brand_dict.items():
            brand_dict[brand] = float(count) / total_count

        # compute contribution of each brand to the country domain
        for brand, percentage in brand_dict.items():
            if brand in self.brand_score:
                self.brand_score[brand] += weight * percentage
            else:
                self.brand_score[brand] = weight * percentage

        return True
开发者ID:DdMad,项目名称:WebCrawler,代码行数:87,代码来源:WebCrawler.py

示例2: Summarize

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import lower [as 别名]
class Summarize(object):
	"""Extractive summarizer: keeps sentences that contain all of the
	top-ranked frequent words of a document, recursively tuning the
	cutoff fraction until the output length falls in a target band.

	NOTE(review): written for Python 2 (``urllib2``, ``dict.iteritems``,
	integer division semantics in ``evaluate``); will not run unmodified
	on Python 3.
	"""
	def __init__(self):		
		# Word-frequency table. Starts as {word: count}; summarize()/
		# summarizeText() later rebind it to a list of (word, count)
		# tuples sorted most-frequent first.
		self.freq = {}
		# Normalized sentences extracted from the document.
		self.sentences = []
		# Full normalized (lower-cased, punctuation-stripped) document text.
		self.data = ''
		# Recursion budget shared by evaluate() so tuning always terminates.
		self.maxRec = 500

	def checkSentence(self,s,x):
		"""Return True iff token list ``s`` has at most 50 tokens AND
		contains every one of the top-``x`` frequent words.

		Expects ``self.freq`` to already be the sorted (word, count)
		tuple list built by summarize()/summarizeText().
		"""
		if len(s)>50:
			return False
		# word is a (token, count) tuple; word[0] is the token itself.
		for word in self.freq[:x]:
			if not (word[0] in s):
				return False

		return True

	def summarize(self,url):
		"""Fetch ``url``, extract its readable body, and return a dict
		with the page ``title`` and a computed ``summary`` string."""
		self.data = urllib2.urlopen(url).read()
		# Reduce the page to its main article HTML via readability.
		self.data = Document(self.data).summary()
		
		# Strip tags and flatten punctuation/brackets/quotes to spaces
		# (apostrophes are removed entirely, joining contractions).
		self.data = MLStripper.strip_tags(self.data).replace('\n', ' ').replace(',', ' ').replace('\t', ' ').replace("'", "").replace('"', ' ').replace('(',' ').replace(')', ' ').replace(':', ' ').replace(']', ' ').replace('[', ' ').replace(';', ' ')
		self.data = self.data.lower()
		# Naive sentence split on periods (no abbreviation handling).
		temp = self.data.split('.')
		

		# All alphanumeric tokens of the whole document.
		text = re.findall(r'([a-z]+|\d+)+', self.data)

		for t in temp:
			self.sentences += [' '.join(re.findall(r'([a-z]+|\d+)+', t))]	

		# Count token frequencies.
		self.freq = {}

		for word in text:
			if word in self.freq:
				self.freq[word] += 1
			else:
				self.freq[word] = 1

		# Rebind freq to (word, count) tuples, most frequent first.
		# iteritems is Python 2 only.
		self.freq = sorted(self.freq.iteritems(), key=operator.itemgetter(1))
		self.freq.reverse()
		# Re-fetch the page just for its <title> element.
		t = lxml.html.parse(url)
		title = t.find(".//title").text
		return {'title': title, 'summary': self.evaluate(0.01)}

	def summarizeText(self, text):
		"""Summarize a raw text/HTML string; returns the summary string."""
		self.data = MLStripper.strip_tags(text).replace('\n', ' ').replace(',', ' ').replace('\t', ' ').replace("'", "").replace('"', ' ').replace('(',' ').replace(')', ' ').replace(':', ' ').replace(']', ' ').replace('[', ' ').replace(';', ' ')
		self.data = self.data.lower()
		temp = self.data.split('.')
		for t in temp:
			self.sentences += [' '.join(re.findall(r'([a-z]+|\d+)+', t))]	

		self.freq = {}

		# NOTE(review): this iterates the CHARACTERS of the original
		# ``text`` argument, not word tokens — frequencies end up being
		# per-character. summarize() tokenizes with a regex first;
		# likely this should too. Confirm intended behavior.
		for word in text:
			if word in self.freq:
				self.freq[word] += 1
			else:
				self.freq[word] = 1

		self.freq = sorted(self.freq.iteritems(), key=operator.itemgetter(1))
		self.freq.reverse()
		
		return self.evaluate(0.01)

	def evaluate(self,d):
		"""Build a summary using the top fraction ``d`` of frequent words,
		recursively nudging ``d`` until the compression ratio lands
		between 0.60 and 0.80 (or the recursion budget runs out)."""
		self.maxRec -= 1
		output = ''

		# Number of top-frequency words a sentence must contain.
		num = len(self.freq)
		num = int(math.floor(num*d))

		for sentence in self.sentences:
			s = re.findall(r'[a-z]+', sentence)
			if self.checkSentence(s, num) == True and len(sentence) > 2:
				# Re-capitalize the first letter and terminate with '. '.
				output += sentence[0].upper()+sentence[1:]+'. '	
		if len(self.data)>0:
			# NOTE(review): under Python 2 this is INTEGER division, so
			# compression is effectively 1 or 0, not a real ratio —
			# float division was probably intended. Confirm.
			compression = 1-(len(output)/len(self.data))
			if self.maxRec >0:
				if compression >= 0.80:
					return self.evaluate(d-.001)

				if compression <= 0.60:
					return self.evaluate(d+.001)

		return output


#s = Summarize()

#print s.summarize('http://www.bbc.co.uk/news/uk-25996176')
开发者ID:DownMoney,项目名称:Jarvis.io,代码行数:93,代码来源:summarize.py


注:本文中的readability.readability.Document.lower方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。