

Python nltk.clean_html Function Code Examples

This article collects typical usage examples of the nltk.clean_html function in Python. If you are looking for concrete answers to questions such as how clean_html is called in practice and what working code that uses it looks like, the hand-picked examples below should help.


A total of 15 code examples of the clean_html function are shown below, sorted by popularity by default.
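Before the project-specific examples, here is a minimal standalone sketch of the typical call pattern. The HTML string and variable names are made up for illustration, and the sketch assumes NLTK 2.x: in NLTK 3.0 and later, clean_html was removed and calling it raises NotImplementedError, so on a current NLTK you would strip markup with an HTML parser such as BeautifulSoup instead.

# Minimal usage sketch (hypothetical input; assumes NLTK 2.x, where nltk.clean_html still exists).
import nltk

raw_html = "<html><body><h1>Title</h1><p>First paragraph.</p></body></html>"

# clean_html strips the tags and returns the remaining text content.
text = nltk.clean_html(raw_html)
print(text)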

Example 1: parse

    def parse(self, fname):
        try:
            with open(fname, "r") as f:
                log.info("Process %s" % fname)
                soup = BeautifulSoup(f.read())
                tbl = soup.find("table", { "class" : "cable" })
                docid = tbl.findAll('tr')[1].\
                        findAll('td')[0].contents[1].contents[0]

                if docid in self.docids:
                    return True

                doc = {
                        "_id": docid,
                        "refererence_id": docid,
                        "date_time": tbl.findAll('tr')[1].\
                                findAll('td')[1].contents[1].contents[0],
                        "classification": tbl.findAll('tr')[1].\
                                findAll('td')[2].contents[1].contents[0],
                        "origin": tbl.findAll('tr')[1].\
                                findAll('td')[3].contents[1].contents[0],
                        "header":nltk.clean_html(str(soup.findAll(['pre'])[0])),
                        "body": nltk.clean_html(str(soup.findAll(['pre'])[1]))
                }
                
                return doc

        except OSError:
            log.error("Can't open '%s'" % fname)
            self.processed -= 1
Developer: benoitc, Project: cablesgate, Lines: 30, Source: cablesgate.py

Example 2: extract_content

 def extract_content(self,raw):
   logging.info('Processor.extract_content')
   
   soup = BeautifulSoup(raw)
   cable_table = soup.find("table", { "class" : "cable" })
   cable_id = cable_table.findAll('tr')[1].findAll('td')[0]\
     .contents[1].contents[0]
   if db.cables.find_one({'_id':cable_id}):
     self.counts['files_not_processed'] = self.counts['files_not_processed'] + 1
     logging.info('Processor.extract_content["CABLE ALREADY EXISTS"]')
     self.print_counts()
     return
     
   cable = Cable(raw)
   cable['_id'] = cable_id
   cable['reference_id'] = cable_id
   cable['date_time'] = cable_table.findAll('tr')[1].findAll('td')[1]\
     .contents[1].contents[0]
   cable['classification'] = cable_table.findAll('tr')[1].findAll('td')[2]\
     .contents[1].contents[0]
   cable['origin'] = cable_table.findAll('tr')[1].findAll('td')[3]\
     .contents[1].contents[0]
   cable['header'] = nltk.clean_html(str(soup.findAll(['pre'])[0]))
   cable['body'] = nltk.clean_html(str(soup.findAll(['pre'])[1]))
   
   db.cables.insert(cable.get())
   
   self.counts['files_processed'] = self.counts['files_processed'] + 1
   
   self.print_counts()
   
   if (self.counts['files_processed'] + self.counts['files_not_processed'])\
     == self.counts['files_to_process']:
     self.dump_json()
Developer: anarchivist, Project: cablegate, Lines: 34, Source: process.py

Example 3: scrape_links_and_wordlistify

def scrape_links_and_wordlistify(links, lower=False, verbose=1):
    import nltk
    import requests
    import string
    raw = ''
    wordlist = {}
    for site in links:
        try:
            if verbose == 1:
                print '[+] fetching data from: ', site
            if site.find('http://pastebin.com/') == 0:
                raw = requests.get(site.replace('http://pastebin.com/', 'http://pastebin.com/raw.php?i=')).content
            else:
                raw = requests.get(site).content
            if lower == False:
                l = string.translate(nltk.clean_html(raw), string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
            else:
                l = string.lower(nltk.clean_html(raw))
                l = string.translate(l, string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
        except:
            if verbose == 1:
                print '[-] Skipping url: ', site
    return wordlist
Developer: tkisason, Project: unhash, Lines: 25, Source: gwordlist.py

Example 4: parse_file

  def parse_file(self, filepath):
    """
    Parses a corpus file and initialize the object.
    
    @param  filepath: The path of the corpus file to parse.
    @type   filepath: C{string}
    """

    html_file = codecs.open(filepath, "r", "utf-8")
    raw_html = html_file.read()
    body = raw_html.split("<body>",1)[1]
    raw_content = nltk.clean_html(body.split("</h1>", 1)[1])

    self.set_title(nltk.clean_html(body.split("</h1>", 1)[0]).strip() + ".")
    
    content = ""
    for p in raw_content.split("\n"):
      p = p.strip()

      if p != "":
        if content != "":
          content += " "
        content += p
    content = content.split("-", 1)[1].replace(u"\u202F", " ").strip()

    self.set_content(content)

    html_file.close()
Developer: 52nlp, Project: KeyBench, Lines: 28, Source: wikinews2012.py

Example 5: scrapeBlog

def scrapeBlog(url, depth): # obs hackkkkkkkkk
    allText = ""
    pages = getPages(url)
    pages = pages[(depth+1):] # take the rest
    posts = []
    timestamps = []
    
    for url in pages:
        response = getContent(url)
        repls = ('januari', 'january'), ('februari', 'february'), ('mars', 'march'), ('maj', 'may'), ('juni', 'june'), ('juli', 'july'), ('augusti', 'august'), ('oktober', 'october')
        response = reduce(lambda a, kv: a.replace(*kv), repls, response.lower())
        
        soup = BeautifulSoup(response)
        
        
        try:
            poststext = soup.select(".blogposttext") # get posts text
            poststext = [nltk.clean_html(unicode(post)) for post in poststext]
            postsdatetime = soup.select(".blogpostheaderdate")
            
            postsdatetime = [nltk.clean_html(unicode(post)) for post in postsdatetime]
            postsdatetime = [parse(post, fuzzy=True) for post in postsdatetime]
            
            posts.extend(poststext[0:len(postsdatetime)])
            timestamps.extend(postsdatetime)
        except:
            pass
        #allText = allText + "\n\n" + getAllText(url)
    
    return posts, timestamps
Developer: maxberggren, Project: sinus, Lines: 30, Source: nattstad_post19.py

Example 6: process_feed

    def process_feed(self, entries):
        abbr = self.abbr
        feed_entries = db.feed_entries
        third = itemgetter(2)

        # Find matching entities in the feed.
        for entry, matches in self.scan_feed(entries):                    
            matches = self.extract_entities(matches)

            ids = map(third, matches)
            strings = [m.group() for m, _, _ in matches]
            assert len(ids) == len(strings)

            # Add references and save in mongo.
            
            entry['state'] = abbr # list probably wiser
            entry['entity_ids'] = ids or None
            entry['entity_strings'] = strings or None
            entry['save_time'] = datetime.datetime.utcnow()
            entry['_id'] = new_feed_id(entry)
            entry['_type'] = 'feedentry'

            entry['summary'] = nltk.clean_html(entry['summary'])
            try:
                entry['summary_detail']['value'] = nltk.clean_html(
                    entry['summary_detail']['value'])
            except KeyError:
                pass
            
            feed_entries.save(entry)
            msg = 'Found %d related entities in %r'
            self.logger.info(msg % (len(ids), entry['title']))
Developer: kevinthew, Project: openstates, Lines: 32, Source: scrape.py

Example 7: getKeyList

def getKeyList(testID):
    myDataQ = getData(testID,1)
    myDataA = getData(testID,0)

    userKeyQ = getUserAnnotate(myDataQ)
    userKeyA = getUserAnnotate(myDataA)

    myCodeListQ = getCodeList(myDataQ)
    myCodeListA = getCodeList(myDataA)
    myHtml = getHTML(testID)
    
    t1 = []
    packQ = []
    funcQ = []
    for item in myCodeListQ:
        try:
            p,f = cparPack(nltk.clean_html(item))
            packQ += p 
            funcQ += f
        except SyntaxError:
            pass
        t1 += preProCode(item)
    fQ,aQ,vQ,cQ = cparFuncs(t1) 
    packQ,funcQ = cparPack(t1)
    fQ = list(set(fQ))
    aQ = list(set(aQ))
    vQ = list(set(vQ))
    cQ = list(set(cQ))

    combQ = []
    for cItem in cQ:
        for fItem in fQ:
            combQ.append(cItem+"."+fItem) 

    t2 = []
    packA = []
    funcA = []
    for item in myCodeListA:
        try:
            p,f = cparPack(nltk.clean_html(item))
            packA += p 
            funcA += f
        except SyntaxError:
            pass
        t2 += preProCode(item)
    fA,aA,vA,cA = cparFuncs(t2) 
    fA = list(set(fA))
    aA = list(set(aA))
    vA = list(set(vA))
    cA = list(set(cA))

    combA = []
    for cItem in cA:
        for fItem in fA:
            combA.append(cItem+"."+fItem) 

    keyList = \
    list(set(fQ+fA+aQ+aA+vQ+vA+cQ+cA+combQ+combA+packQ+packA+funcQ+funcA+userKeyQ+userKeyA))

    return keyList
Developer: paulyang1990, Project: FYT-stackoverflow.com-Summarization, Lines: 60, Source: ana.py

Example 8: getarticle

def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html,from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    storytag = soup.findAll('div',{'class':None})[1]
    text = nltk.clean_html("{0}".format(storytag))
    return title,text
Developer: thequbit, Project: newsvis, Lines: 8, Source: rhp_scraper.py

Example 9: getarticle

def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html,from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    ptags = soup.find_all("p")
    text = nltk.clean_html("{0}".format(ptags[2]))
    return title,text
Developer: thequbit, Project: newsvis, Lines: 8, Source: whec_scraper.py

Example 10: preprocess_hotel_review

def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Hotel review preprocess and truthfulness of the hotel review
    :param file_contents:
    :param file_contents_test:
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"

        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])

    truth_uni_prob_dict, truth_bi_prob_dict = process_prob(" ".join(truth_sentences))
    false_uni_prob_dict, false_bi_prob_dict = process_prob(" ".join(false_sentences))

    raw_test = clean_html(file_contents_test)
    raw_test = re.sub(r'IsTruthFul,review', "", raw_test)
    sentence_list_test = tokenize.line_tokenize(raw_test)
    test_list = []
    test_truth_false_list = []
    truth_count = false_count = i = 0
    for sentence in sentence_list_test:
        sent_arr = re.split(r',', sentence)
        truth_uni_perplex, truth_bi_perplex = perplexity(sent_arr[1], truth_uni_prob_dict, truth_bi_prob_dict)
        false_uni_perplex, false_bi_perplex = perplexity(sent_arr[1], false_uni_prob_dict, false_bi_prob_dict)
        test_list.append((sent_arr[1], truth_bi_perplex, false_bi_perplex))
        truth_or_false = 1 if truth_bi_perplex < false_bi_perplex else 0
        #truth_or_false = 1 if truth_uni_perplex < false_uni_perplex else 0
        if truth_or_false:
            truth_count += 1
        else:
            false_count += 1
        test_truth_false_list.append([i, truth_or_false])
        i += 1

    import csv

    with open("kaggle_sharp.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows([['Id', 'Label']])
        writer.writerows(test_truth_false_list)
    print test_list
    print test_truth_false_list
    print truth_count
    print false_count
Developer: hs634, Project: cs4740, Lines: 57, Source: smoothing-ngram.py

Example 11: extrait

    def extrait(self, rss):
        d = feedparser.parse(rss)
        h = random.randint(0, len(d['entries']) - 1)
        print h
        print str(len(d['entries']))
        titre = nltk.clean_html(d['items'][h].title)
        descriptionb = nltk.clean_html(d['items'][h].description)
        description = re.sub("&#(\d+);", lambda m: chr(int(m.group(1))), descriptionb)
        return titre + ". \n\n" + description
Developer: appnt, Project: SiriServer, Lines: 9, Source: lecteurFluxRssFrance.py

Example 12: __init__

	def __init__(self,directory):
		#get list of all tags that can be simplified into synonym tags
		stf = open(directory+"tags_synonym.csv", 'r') #converting each tag to its hypernym
		rdr= csv.reader(stf)
		for r in rdr:  
			#r[0]=tag  r[1]=tag it should be replaced with
			self.synonym_tags[r[0]]=r[1]
		stf.close()

		tf=open(directory+"tags.csv", 'r') #assign wieght for tag for each tag
		rdr=csv.reader(tf)
		for r in rdr:
			tmp=r[0].split(';') #tmp[0]=tag      tmp[1]=frequency
			self.tags[tmp[0]]=float(1/float(tmp[1]))
		tf.close()

		for tmp in self.tags:
			t=tmp.split('-')
			if len(t)>1:
				t2=tmp.replace('-',' ')
				#print t2
				if t[0] not in self.complex_tags:
					self.complex_tags[t[0]]=[]

				self.complex_tags[t[0]].append(t2)
				#self.complex_tags_replacements[t[0]]=tmp
				self.complex_tags_replacements[t2]=tmp

		qf=open(directory+"Questions&Answers&Tags.csv",'r')
		rdr=csv.reader(qf)
		for r in rdr: #r[0]:question title r[1]=question title r[2]: best answer r[3]: tags
			if r[0][len(r[0])-1] not in ['!','?','.']:
				r[0]=r[0]+'.'
			r[1]=nltk.clean_html(r[1])
			r[2]=nltk.clean_html(r[2])
			r[0]=r[0]+' '+r[1]
			self.questions.append(r[0])
			self.answers.append(r[1])
			n=len(self.questions)-1
			r[3]=r[3].replace('<','')
			r[3]=r[3].replace('>',' ')
			tmplist=r[3].split(' ')
			for t in tmplist:
				if t in self.synonym_tags:
					r[3]=r[3].replace(t,self.synonym_tags[t])

			tmplist=r[3].split(' ')
			tmplist.pop()
			self.tagsInQuestions[n]=tmplist
			for t in tmplist:
				if t not in self.questionsForTags:
					self.questionsForTags[t]=[]
				self.questionsForTags[t].append(n)

		qf.close()
Developer: bijilap, Project: Doctor-Tux, Lines: 55, Source: DoctorTux.py

Example 13: index

def index():
  steps = Step.query.order_by(Step.num_de_paso)
  for step in steps:
    if step.tipo_de_tramite:
      step.tipo_de_tramite = clean_html(step.tipo_de_tramite)
    if step.requisitos:
      step.requisitos = clean_html(step.requisitos)
    if step.consideraciones:
      step.consideraciones = clean_html(step.consideraciones)
    if step.preguntas_frecuentes:
      step.preguntas_frecuentes = clean_html(step.preguntas_frecuentes)
  return render_template('index.html', steps=steps)
Developer: CoquiCoders, Project: negocio123, Lines: 12, Source: negocio123.py

Example 14: autos_us

def autos_us():
    html = open('autos-us.html').read()
    soup = BeautifulSoup(html)
    first = soup.find('li').contents[0]
    second = first.parent.next_sibling.next_sibling.contents[0]
    third = second.parent.next_sibling.next_sibling.contents[0]
    majors = [first, second, third]
    minors = soup.select('ul li ul li')
    major_tokens = [nltk.clean_html(str(w)) for w in majors]
    minor_tokens = [nltk.clean_html(str(w)) for w in minors]
    minor_tokens = [re.sub(r'\s\([\S\s]+\)|\[\s\S\s\]|\n\s[A-Za-z]+', r'', token) for token in minor_tokens]
    tokens = list(set(major_tokens + minor_tokens))
    return tokens
Developer: cinterloper, Project: rap-analysis, Lines: 13, Source: counting_cars.py

Example 15: gasPrices

def gasPrices(origin, destination):
	one_way_cost = ''
	from_address = origin
	to_address = destination
	new_from_address = from_address.replace(" ", "+")
	new_to_address = to_address.replace(" ", "+")
	url = "http://www.travelmath.com/cost-of-driving/from/" + new_from_address + "/to/" + new_to_address
	html = urllib.urlopen(url)
	for line in html:
		if "costofdriving" and "$" in line:
			one_way_cost = nltk.clean_html(line.split("one-way")[0].replace("$", ""))
			round_trip_cost = nltk.clean_html(line.split("one-way")[1].replace("round trip", "").replace("$", "")).replace('/ ', "")
			break
	return one_way_cost
Developer: agadiraju, Project: r-3, Lines: 14, Source: views.py


Note: The nltk.clean_html examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code; do not reproduce without permission.