This article collects typical usage examples of the nltk.clean_html function in Python. If you are unsure how the clean_html function works, how to call it, or what real code that uses it looks like, the curated examples below should help.
Fifteen code examples of the clean_html function are shown below, ordered by popularity.
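Before the examples, a minimal sketch of the call itself may help: nltk.clean_html takes a string of HTML and returns the text with the markup stripped. One caveat: the function only exists in NLTK 2.x; NLTK 3.0 removed it and directs users to an HTML parser instead. The snippet below is illustrative only (the sample HTML string and the BeautifulSoup fallback are assumptions for newer NLTK installs, not part of the examples that follow).

import nltk

html = "<html><body><h1>Title</h1><p>Some <b>bold</b> text.</p></body></html>"
try:
    # NLTK 2.x: clean_html strips the tags and returns the remaining text.
    text = nltk.clean_html(html)
except (AttributeError, NotImplementedError):
    # NLTK 3.x removed clean_html; fall back to an HTML parser (assumed substitute).
    from bs4 import BeautifulSoup
    text = BeautifulSoup(html, "html.parser").get_text(separator=" ")
print(text)

All 15 examples below were written against the NLTK 2.x behaviour, which is why they call clean_html directly on raw HTML.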
Example 1: parse
def parse(self, fname):
    try:
        with open(fname, "r") as f:
            log.info("Process %s" % fname)
            soup = BeautifulSoup(f.read())
            tbl = soup.find("table", {"class": "cable"})
            docid = tbl.findAll('tr')[1].\
                findAll('td')[0].contents[1].contents[0]
            if docid in self.docids:
                return True
            doc = {
                "_id": docid,
                "reference_id": docid,
                "date_time": tbl.findAll('tr')[1].\
                    findAll('td')[1].contents[1].contents[0],
                "classification": tbl.findAll('tr')[1].\
                    findAll('td')[2].contents[1].contents[0],
                "origin": tbl.findAll('tr')[1].\
                    findAll('td')[3].contents[1].contents[0],
                "header": nltk.clean_html(str(soup.findAll(['pre'])[0])),
                "body": nltk.clean_html(str(soup.findAll(['pre'])[1]))
            }
            return doc
    except OSError:
        log.error("Can't open '%s'" % fname)
        self.processed -= 1
Example 2: extract_content
def extract_content(self, raw):
    logging.info('Processor.extract_content')
    soup = BeautifulSoup(raw)
    cable_table = soup.find("table", {"class": "cable"})
    cable_id = cable_table.findAll('tr')[1].findAll('td')[0]\
        .contents[1].contents[0]
    if db.cables.find_one({'_id': cable_id}):
        self.counts['files_not_processed'] = self.counts['files_not_processed'] + 1
        logging.info('Processor.extract_content["CABLE ALREADY EXISTS"]')
        self.print_counts()
        return
    cable = Cable(raw)
    cable['_id'] = cable_id
    cable['reference_id'] = cable_id
    cable['date_time'] = cable_table.findAll('tr')[1].findAll('td')[1]\
        .contents[1].contents[0]
    cable['classification'] = cable_table.findAll('tr')[1].findAll('td')[2]\
        .contents[1].contents[0]
    cable['origin'] = cable_table.findAll('tr')[1].findAll('td')[3]\
        .contents[1].contents[0]
    cable['header'] = nltk.clean_html(str(soup.findAll(['pre'])[0]))
    cable['body'] = nltk.clean_html(str(soup.findAll(['pre'])[1]))
    db.cables.insert(cable.get())
    self.counts['files_processed'] = self.counts['files_processed'] + 1
    self.print_counts()
    if (self.counts['files_processed'] + self.counts['files_not_processed'])\
            == self.counts['files_to_process']:
        self.dump_json()
Example 3: scrape_links_and_wordlistify
def scrape_links_and_wordlistify(links, lower=False, verbose=1):
    import nltk
    import requests
    import string
    raw = ''
    wordlist = {}
    for site in links:
        try:
            if verbose == 1:
                print '[+] fetching data from: ', site
            if site.find('http://pastebin.com/') == 0:
                raw = requests.get(site.replace('http://pastebin.com/', 'http://pastebin.com/raw.php?i=')).content
            else:
                raw = requests.get(site).content
            if lower == False:
                l = string.translate(nltk.clean_html(raw), string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
            else:
                l = string.lower(nltk.clean_html(raw))
                l = string.translate(l, string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
        except:
            if verbose == 1:
                print '[-] Skipping url: ', site
    return wordlist
Example 4: parse_file
def parse_file(self, filepath):
    """
    Parses a corpus file and initializes the object.
    @param filepath: The path of the corpus file to parse.
    @type filepath: C{string}
    """
    html_file = codecs.open(filepath, "r", "utf-8")
    raw_html = html_file.read()
    body = raw_html.split("<body>", 1)[1]
    raw_content = nltk.clean_html(body.split("</h1>", 1)[1])
    self.set_title(nltk.clean_html(body.split("</h1>", 1)[0]).strip() + ".")
    content = ""
    for p in raw_content.split("\n"):
        p = p.strip()
        if p != "":
            if content != "":
                content += " "
            content += p
    content = content.split("-", 1)[1].replace(u"\u202F", " ").strip()
    self.set_content(content)
    html_file.close()
Example 5: scrapeBlog
def scrapeBlog(url, depth):  # obs hackkkkkkkkk
    allText = ""
    pages = getPages(url)
    pages = pages[(depth + 1):]  # take the rest
    posts = []
    timestamps = []
    for url in pages:
        response = getContent(url)
        repls = ('januari', 'january'), ('februari', 'february'), ('mars', 'march'), ('maj', 'may'), ('juni', 'june'), ('juli', 'july'), ('augusti', 'august'), ('oktober', 'october')
        response = reduce(lambda a, kv: a.replace(*kv), repls, response.lower())
        soup = BeautifulSoup(response)
        try:
            poststext = soup.select(".blogposttext")  # get posts text
            poststext = [nltk.clean_html(unicode(post)) for post in poststext]
            postsdatetime = soup.select(".blogpostheaderdate")
            postsdatetime = [nltk.clean_html(unicode(post)) for post in postsdatetime]
            postsdatetime = [parse(post, fuzzy=True) for post in postsdatetime]
            posts.extend(poststext[0:len(postsdatetime)])
            timestamps.extend(postsdatetime)
        except:
            pass
        # allText = allText + "\n\n" + getAllText(url)
    return posts, timestamps
Example 6: process_feed
def process_feed(self, entries):
    abbr = self.abbr
    feed_entries = db.feed_entries
    third = itemgetter(2)
    # Find matching entities in the feed.
    for entry, matches in self.scan_feed(entries):
        matches = self.extract_entities(matches)
        ids = map(third, matches)
        strings = [m.group() for m, _, _ in matches]
        assert len(ids) == len(strings)
        # Add references and save in mongo.
        entry['state'] = abbr  # list probably wiser
        entry['entity_ids'] = ids or None
        entry['entity_strings'] = strings or None
        entry['save_time'] = datetime.datetime.utcnow()
        entry['_id'] = new_feed_id(entry)
        entry['_type'] = 'feedentry'
        entry['summary'] = nltk.clean_html(entry['summary'])
        try:
            entry['summary_detail']['value'] = nltk.clean_html(
                entry['summary_detail']['value'])
        except KeyError:
            pass
        feed_entries.save(entry)
        msg = 'Found %d related entities in %r'
        self.logger.info(msg % (len(ids), entry['title']))
Example 7: getKeyList
def getKeyList(testID):
    myDataQ = getData(testID, 1)
    myDataA = getData(testID, 0)
    userKeyQ = getUserAnnotate(myDataQ)
    userKeyA = getUserAnnotate(myDataA)
    myCodeListQ = getCodeList(myDataQ)
    myCodeListA = getCodeList(myDataA)
    myHtml = getHTML(testID)
    t1 = []
    packQ = []
    funcQ = []
    for item in myCodeListQ:
        try:
            p, f = cparPack(nltk.clean_html(item))
            packQ += p
            funcQ += f
        except SyntaxError:
            pass
        t1 += preProCode(item)
    fQ, aQ, vQ, cQ = cparFuncs(t1)
    packQ, funcQ = cparPack(t1)
    fQ = list(set(fQ))
    aQ = list(set(aQ))
    vQ = list(set(vQ))
    cQ = list(set(cQ))
    combQ = []
    for cItem in cQ:
        for fItem in fQ:
            combQ.append(cItem + "." + fItem)
    t2 = []
    packA = []
    funcA = []
    for item in myCodeListA:
        try:
            p, f = cparPack(nltk.clean_html(item))
            packA += p
            funcA += f
        except SyntaxError:
            pass
        t2 += preProCode(item)
    fA, aA, vA, cA = cparFuncs(t2)
    fA = list(set(fA))
    aA = list(set(aA))
    vA = list(set(vA))
    cA = list(set(cA))
    combA = []
    for cItem in cA:
        for fItem in fA:
            combA.append(cItem + "." + fItem)
    keyList = \
        list(set(fQ + fA + aQ + aA + vQ + vA + cQ + cA + combQ + combA + packQ + packA + funcQ + funcA + userKeyQ + userKeyA))
    return keyList
Example 8: getarticle
def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    storytag = soup.findAll('div', {'class': None})[1]
    text = nltk.clean_html("{0}".format(storytag))
    return title, text
Example 9: getarticle
def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    ptags = soup.find_all("p")
    text = nltk.clean_html("{0}".format(ptags[2]))
    return title, text
Example 10: preprocess_hotel_review
def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Hotel review preprocess and truthfulness of the hotel review
    :param file_contents:
    :param file_contents_test:
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"
        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])
    truth_uni_prob_dict, truth_bi_prob_dict = process_prob(" ".join(truth_sentences))
    false_uni_prob_dict, false_bi_prob_dict = process_prob(" ".join(false_sentences))
    raw_test = clean_html(file_contents_test)
    raw_test = re.sub(r'IsTruthFul,review', "", raw_test)
    sentence_list_test = tokenize.line_tokenize(raw_test)
    test_list = []
    test_truth_false_list = []
    truth_count = false_count = i = 0
    for sentence in sentence_list_test:
        sent_arr = re.split(r',', sentence)
        truth_uni_perplex, truth_bi_perplex = perplexity(sent_arr[1], truth_uni_prob_dict, truth_bi_prob_dict)
        false_uni_perplex, false_bi_perplex = perplexity(sent_arr[1], false_uni_prob_dict, false_bi_prob_dict)
        test_list.append((sent_arr[1], truth_bi_perplex, false_bi_perplex))
        truth_or_false = 1 if truth_bi_perplex < false_bi_perplex else 0
        # truth_or_false = 1 if truth_uni_perplex < false_uni_perplex else 0
        if truth_or_false:
            truth_count += 1
        else:
            false_count += 1
        test_truth_false_list.append([i, truth_or_false])
        i += 1
    import csv
    with open("kaggle_sharp.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows([['Id', 'Label']])
        writer.writerows(test_truth_false_list)
    print test_list
    print test_truth_false_list
    print truth_count
    print false_count
Example 11: extrait
def extrait(self, rss):
    d = feedparser.parse(rss)
    h = random.randint(0, len(d['entries']) - 1)
    print h
    print str(len(d['entries']))
    titre = nltk.clean_html(d['items'][h].title)
    descriptionb = nltk.clean_html(d['items'][h].description)
    description = re.sub("&#(\d+);", lambda m: chr(int(m.group(1))), descriptionb)
    return titre + ". \n\n" + description
Example 12: __init__
def __init__(self, directory):
    # get list of all tags that can be simplified into synonym tags
    stf = open(directory + "tags_synonym.csv", 'r')  # converting each tag to its hypernym
    rdr = csv.reader(stf)
    for r in rdr:
        # r[0]=tag  r[1]=tag it should be replaced with
        self.synonym_tags[r[0]] = r[1]
    stf.close()
    tf = open(directory + "tags.csv", 'r')  # assign a weight for each tag
    rdr = csv.reader(tf)
    for r in rdr:
        tmp = r[0].split(';')  # tmp[0]=tag  tmp[1]=frequency
        self.tags[tmp[0]] = float(1 / float(tmp[1]))
    tf.close()
    for tmp in self.tags:
        t = tmp.split('-')
        if len(t) > 1:
            t2 = tmp.replace('-', ' ')
            # print t2
            if t[0] not in self.complex_tags:
                self.complex_tags[t[0]] = []
            self.complex_tags[t[0]].append(t2)
            # self.complex_tags_replacements[t[0]]=tmp
            self.complex_tags_replacements[t2] = tmp
    qf = open(directory + "Questions&Answers&Tags.csv", 'r')
    rdr = csv.reader(qf)
    for r in rdr:  # r[0]: question title  r[1]: question title  r[2]: best answer  r[3]: tags
        if r[0][len(r[0]) - 1] not in ['!', '?', '.']:
            r[0] = r[0] + '.'
        r[1] = nltk.clean_html(r[1])
        r[2] = nltk.clean_html(r[2])
        r[0] = r[0] + ' ' + r[1]
        self.questions.append(r[0])
        self.answers.append(r[1])
        n = len(self.questions) - 1
        r[3] = r[3].replace('<', '')
        r[3] = r[3].replace('>', ' ')
        tmplist = r[3].split(' ')
        for t in tmplist:
            if t in self.synonym_tags:
                r[3] = r[3].replace(t, self.synonym_tags[t])
        tmplist = r[3].split(' ')
        tmplist.pop()
        self.tagsInQuestions[n] = tmplist
        for t in tmplist:
            if t not in self.questionsForTags:
                self.questionsForTags[t] = []
            self.questionsForTags[t].append(n)
    qf.close()
Example 13: index
def index():
    steps = Step.query.order_by(Step.num_de_paso)
    for step in steps:
        if step.tipo_de_tramite:
            step.tipo_de_tramite = clean_html(step.tipo_de_tramite)
        if step.requisitos:
            step.requisitos = clean_html(step.requisitos)
        if step.consideraciones:
            step.consideraciones = clean_html(step.consideraciones)
        if step.preguntas_frecuentes:
            step.preguntas_frecuentes = clean_html(step.preguntas_frecuentes)
    return render_template('index.html', steps=steps)
Example 14: autos_us
def autos_us():
    html = open('autos-us.html').read()
    soup = BeautifulSoup(html)
    first = soup.find('li').contents[0]
    second = first.parent.next_sibling.next_sibling.contents[0]
    third = second.parent.next_sibling.next_sibling.contents[0]
    majors = [first, second, third]
    minors = soup.select('ul li ul li')
    major_tokens = [nltk.clean_html(str(w)) for w in majors]
    minor_tokens = [nltk.clean_html(str(w)) for w in minors]
    minor_tokens = [re.sub(r'\s\([\S\s]+\)|\[\s\S\s\]|\n\s[A-Za-z]+', r'', token) for token in minor_tokens]
    tokens = list(set(major_tokens + minor_tokens))
    return tokens
Example 15: gasPrices
def gasPrices(origin, destination):
    one_way_cost = ''
    from_address = origin
    to_address = destination
    new_from_address = from_address.replace(" ", "+")
    new_to_address = to_address.replace(" ", "+")
    url = "http://www.travelmath.com/cost-of-driving/from/" + new_from_address + "/to/" + new_to_address
    html = urllib.urlopen(url)
    for line in html:
        if "costofdriving" in line and "$" in line:
            one_way_cost = nltk.clean_html(line.split("one-way")[0].replace("$", ""))
            round_trip_cost = nltk.clean_html(line.split("one-way")[1].replace("round trip", "").replace("$", "")).replace('/ ', "")
            break
    return one_way_cost