This article collects typical usage examples of the Python function utils.clean_text. If you have been wondering what clean_text does and how to call it, the hand-picked code examples here may help.
Shown below are 15 code examples of the clean_text function, sorted by popularity by default.
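Note that each project below ships its own clean_text, so the exact behavior varies from repository to repository. As rough orientation only, here is a hypothetical sketch of what such a helper commonly does; it is not the implementation used by any example below:

import re

def clean_text(text):
    # Hypothetical sketch of a typical clean_text helper; the real helpers
    # used by the examples below live in each project's own utils module.
    text = text.replace(u"\xa0", u" ")   # normalize non-breaking spaces
    text = re.sub(r"\s+", u" ", text)    # collapse runs of whitespace
    return text.strip()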
Example 1: _get_in_charged_commissions
def _get_in_charged_commissions(dico, dico_nl, document):
    document.in_charge_commissions = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?COMMISSION CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?COMMISSIE KAMER", x), dico_nl.keys()))):
        icc = InChargeCommissions()
        icc.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        icc.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        icc.commission["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        icc.commission["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])
        if dico[key].get("Rapporteur"):
            # FIXME link to actual deputies
            icc.rapporters = map(clean_text, dico[key]["Rapporteur"].text.split("\n\t\t\t\t\t"))

        icc.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        icc.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        if dico[key].get("Rapport"):
            icc.rapport = {"url": dico[key]["Rapport"].a["href"], "date": clean_text(dico[key]["Rapport"].contents[-2])}

        icc.save()
        document.in_charge_commissions.append(icc)
Example 2: _get_document_chambre
def _get_document_chambre(dico, dico_nl, document):
    if not dico.get("Document Chambre"):
        return

    chambre_dico = dico['Document Chambre']
    chambre_dico_nl = dico_nl['Document Kamer']

    document_chambre = DocumentChambre()
    document_chambre.deposition_date = get_text_else_blank(chambre_dico, u'Date de dépôt')
    document_chambre.type["fr"] = chambre_dico[u'Type de document'].text
    document_chambre.type["nl"] = chambre_dico_nl[u'Document type'].text
    document_chambre.taken_in_account_date = get_text_else_blank(chambre_dico, u'Prise en considération')
    document_chambre.distribution_date = get_text_else_blank(chambre_dico, u'Date de distribution')
    document_chambre.sending_date = get_text_else_blank(chambre_dico, u'Date d\'envoi')
    document_chambre.ending_date = get_text_else_blank(chambre_dico, u'Date de fin')
    document_chambre.status["fr"] = get_text_else_blank(chambre_dico, u'Statut')
    document_chambre.status["nl"] = get_text_else_blank(chambre_dico_nl, u'Status')
    document_chambre.comments["fr"] = get_text_else_blank(chambre_dico, u'Commentaire').split(' ')
    document_chambre.comments["nl"] = get_text_else_blank(chambre_dico_nl, u'Commentaar').split(' ')

    _get_authors(chambre_dico, chambre_dico_nl, document_chambre)

    url, tipe, session = clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />")
    _, tipe_nl, _ = clean_text(str(chambre_dico_nl[u'head']).replace(" ", "")).split("<br />")
    url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url
    document_chambre.pdf = DocumentChambrePdf.objects.create(url=url,
                                                             type={"fr": tipe.strip(), "nl": tipe_nl.strip()},
                                                             session=session.split()[-2])

    _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre)

    if chambre_dico.get(u'Document(s) joint(s)/lié(s)'):
        document_chambre.joint_pdfs = [{"url": x.a["href"], "title": {"fr": x.contents[0][1:-1], "nl": y.contents[0][1:-1]}}
                                       for x, y in zip(chambre_dico[u'Document(s) joint(s)/lié(s)'],
                                                       chambre_dico_nl[u'Gekoppeld(e)/verbonden document(en)'])]

    document_chambre.save()
    document.document_chambre = document_chambre
Example 3: _get_plenaries
def _get_plenaries(dico, dico_nl, document):
    document.plenaries = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?SEANCE PLENIERE CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?PLENAIRE VERGADERING KAMER", x), dico_nl.keys()))):
        pl = DocumentPlenary()
        pl.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        pl.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        pl.type["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        pl.type["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])

        pl.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.save()
        document.plenaries.append(pl)
Example 4: parse_house_cosponsors
def parse_house_cosponsors(self, bill, cell):
    # if there's only one sponsor, we don't have to worry about this.
    if (not cell.a.nextSibling or
            not cell.a.nextSibling.nextSibling or
            'href' not in cell.a.nextSibling.nextSibling):
        cosponsor_dirty = cell.a.em.contents[0]
        cosponsor = clean_text(cosponsor_dirty)
        bill.add_sponsor('cosponsor', cosponsor,
                         sponsor_link=cell.a['href'])
    else:
        # there are several sponsors, and we have to go to the bill text
        bill_text_url = cell.a.nextSibling.nextSibling['href']
        try:
            doc = self.urlopen(bill_text_url)
            # people between (Sponsor) and (Co-Sponsor) are the cosponsors
            m = re.search(r"\(Sponsor\),?(.*)\(Co", doc, re.DOTALL)
            if m:
                cosponsor_list = clean_text(m.group(1))
                cosponsor_list = re.split(" ?(?:,| AND ) ?",
                                          cosponsor_list)
                for cosponsor_dirty in cosponsor_list:
                    cosponsor = clean_text(cosponsor_dirty)
                    bill.add_sponsor('cosponsor', cosponsor)
        except urllib2.HTTPError as e:
            if e.code == 404:
                # Some of the bill text pages are broken, but the
                # rest of the bill metadata is valid, so just
                # log the error and move on
                self.log('404 on %s, continuing' % bill_text_url)
            else:
                raise e
Example 5: parse_cosponsors_from_bill
def parse_cosponsors_from_bill(self, bill, url):
    bill_page = self.urlopen(url)
    bill_page = lxml.html.fromstring(bill_page)
    sponsors_text = find_nodes_with_matching_text(
        bill_page, '//p/span', r'\s*INTRODUCED.*')
    if len(sponsors_text) == 0:
        # probably it's withdrawn
        return
    sponsors_text = sponsors_text[0].text_content()
    sponsors = clean_text(sponsors_text).split(',')
    # if there are several comma-separated entries, list them.
    if len(sponsors) > 1:
        # the sponsor and the cosponsor were already taken from the
        # previous page, so ignore those:
        sponsors = sponsors[2::]
    for part in sponsors:
        parts = re.split(r' (?i)and ', part)
        for sponsor in parts:
            cosponsor_name = clean_text(sponsor)
            if cosponsor_name != "":
                cosponsor_name = cosponsor_name.replace(
                    u'\u00a0', " ")  # epic hax
                for name in re.split(r'\s+AND\s+', cosponsor_name):
                    # for name in cosponsor_name.split("AND"):
                    name = name.strip()
                    if name:
                        bill.add_sponsor('cosponsor', name)
Example 6: add_text
def add_text(status):
    """ Shortens the text to 140 characters for displaying it in the list control. """
    message = ""
    if status.has_key("copy_history"):
        txt = status["copy_history"][0]["text"]
    else:
        txt = status["text"]
    if len(txt) < 140:
        message = utils.clean_text(txt)
    else:
        message = utils.clean_text(txt[:139])
    return message
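A hypothetical call, for illustration only (the status dictionary below is invented; per the code above, "copy_history" is where the text of a reposted status lives):

status = {"text": u"A very long post body... " * 20}
message = add_text(status)  # longer than 140 chars, so cut to 139 before cleaning
print message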
Example 7: _build_sub_section
def _build_sub_section(i, dico):
    sub_section = clean_text(i.td.b.text)
    if dico.get(sub_section):
        raise Exception("'%s' is already used as a key for '%s'" % (sub_section, dico[sub_section]))
    dico[sub_section] = AccessControlDict()
    dico[sub_section]["head"] = i('td')[1]
    return sub_section
Example 8: tag_tokens
def tag_tokens(self, tokens, no_repeats=False):
    """
    Runs the SRL process on the given tokens.

    :param tokens: a list of tokens (as strings)
    :param no_repeats: whether to prevent repeated argument labels
    :returns: a list of lists (one list for each sentence). Sentences have tuples
        (all_tokens, predicate, arg_structure), where arg_structure is a dictionary
        mapping argument labels to the words they include.
    """
    tokens_obj = [attributes.Token(utils.clean_text(t, False)) for t in tokens]
    converted_bound = np.array([self.boundary_reader.converter.convert(t)
                                for t in tokens_obj])
    converted_class = np.array([self.classify_reader.converter.convert(t)
                                for t in tokens_obj])

    pred_positions = self.find_predicates(tokens_obj)

    # first, argument boundary detection
    # the answer includes all predicates
    answers = self.boundary_nn.tag_sentence(converted_bound, pred_positions)
    boundaries = [[self.boundary_itd[x] for x in pred_answer]
                  for pred_answer in answers]
    arg_limits = [utils.boundaries_to_arg_limits(pred_boundaries)
                  for pred_boundaries in boundaries]

    # now, argument classification
    answers = self.classify_nn.tag_sentence(converted_class,
                                            pred_positions, arg_limits,
                                            allow_repeats=not no_repeats)
    arguments = [[self.classify_itd[x] for x in pred_answer]
                 for pred_answer in answers]

    structures = _group_arguments(tokens, pred_positions, boundaries, arguments)
    return SRLAnnotatedSentence(tokens, structures)
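A hypothetical usage sketch, based only on the docstring above; tagger stands for an already-initialized instance of the class that defines tag_tokens:

# Hypothetical usage; assumes `tagger` is an initialized instance of the
# SRL tagger class this method belongs to (setup not shown here).
tokens = [u"John", u"bought", u"a", u"new", u"car"]
annotated = tagger.tag_tokens(tokens, no_repeats=True)
# Per the docstring, the result wraps (all_tokens, predicate, arg_structure)
# tuples, where arg_structure maps argument labels to their words.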
Example 9: parse_cosponsors_from_bill
def parse_cosponsors_from_bill(self, bill, url):
    with self.urlopen(url) as bill_page:
        bill_page = lxml.html.fromstring(bill_page)
        sponsors_text = find_nodes_with_matching_text(bill_page, '//p/span', r'\s*INTRODUCED.*')
        if len(sponsors_text) == 0:
            # probably it's withdrawn
            return
        sponsors_text = sponsors_text[0].text_content()
        sponsors = clean_text(sponsors_text).split(',')
        if len(sponsors) > 1:  # if there are several comma-separated entries, list them.
            # the sponsor and the cosponsor were already taken from the
            # previous page, so ignore those:
            sponsors = sponsors[2::]
        for part in sponsors:
            parts = re.split(r' (?i)and ', part)
            for sponsor in parts:
                bill.add_sponsor('cosponsor', clean_text(sponsor))
Example 10: df_transform
def df_transform(self, terms):
    self.df[pd.isnull(self.df['Comment'])] = ""
    self.df = self.df.drop_duplicates('Comment')
    self.df['date'] = self.df['date'].apply(lambda x: unix_convert(x))
    self.df['Comment'] = self.df['Comment'].apply(lambda x: clean_text(str(x)))
    self.df['Sentiment_raw'] = self.df.apply(lambda row: sentiment(row['Comment']), axis=1)
    self.df['Sentiment'] = self.df.apply(lambda row: sentiment_new(row['Comment'], terms), axis=1)
    self.df['State'] = self.df.apply(lambda row: state_label(str(row['Locations'])), axis=1)
    self.df = pd.merge(self.df, self.longlat, how='left', on='State')
Example 11: _get_next_documents
def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre):
    if chambre_dico.get('Document(s) suivant(s)'):
        for d, d_nl in zip(document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)']),
                           document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])):
            logger.debug("add pdf %s" % clean_text(d[0].font.text))
            doc = OtherDocumentChambrePdf()
            doc.url = d[0].a['href'] if d[0].a else d[0].td.text
            doc.type["fr"] = clean_text(d[0].font.text)
            doc.type["nl"] = clean_text(d_nl[0].font.text)
            doc.distribution_date = d[1]('td')[-1].text
            for dep, dep_nl in zip(d[2:], d_nl[2:]):
                if dep.a:
                    lachambre_id = re.search('key=(\d+)', dep.a["href"]).groups()[0]
                    deputy = Deputy.objects.get(lachambre_id=lachambre_id)
                    doc.authors.append({"lachambre_id": deputy.lachambre_id,
                                        "id": deputy.id,
                                        "full_name": deputy.full_name,
                                        "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
                else:
                    doc.authors.append({"lachambre_id": -1,
                                        "id": -1,
                                        "full_name": dep('td')[-1].contents[2].strip(),
                                        "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
            doc.save()
            document_chambre.other_pdfs.append(doc)
Example 12: parse_stations
def parse_stations(self, html):
    bs = BeautifulSoup(html)
    tables = bs.findAll('table', {'class': 'show_fw'})
    st = {}
    for i in range(2):
        trs = tables[i].findAll('tr')
        direction = clean_text(trs[0].text.replace('Fahrtrichtung', ''))
        sta = []
        for tr in trs[2:-1]:
            if tr.a:
                sta.append((clean_text(tr.a.text), defaults.base_url + tr.a['href']))
            else:
                sta.append((clean_text(tr.text), None))
        st[direction] = sta
    return st
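For orientation, a hypothetical call; scraper stands for an instance of the class defining parse_stations and html for an already-fetched timetable page:

# Hypothetical usage; `scraper` and `html` are stand-ins for illustration.
stations = scraper.parse_stations(html)
# Expected shape, per the code above: one entry per direction, e.g.
# {u"Direction A": [(u"Some station", u"http://.../details"), (u"Other stop", None)], ...}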
Example 13: _build_first_level
def _build_first_level(i, dico):
    key = clean_text(i.td.text)
    # we can get several "Moniteur erratum" entries
    if unicode(key) in ('Moniteur erratum', 'Staatsblad erratum'):
        if not dico.get(key):
            dico[key] = []
        dico[key].append(i('td')[1])
    else:
        if dico.get(key):
            raise Exception("'%s' is already used as a key for '%s'" % (key, dico[key]))
        dico[key] = i('td')[1]
Example 14: _get_competences
def _get_competences(dico, dico_nl, document):
    # FIXME: meh, DRY
    if dico.get(u"Compétence") and dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title), (_, _title_nl) in zip([clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]],
                                                   [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]):
            logger.debug("append time line %s %s %s" % (_date, _title, _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))
    elif dico.get(u"Compétence"):
        document.timeline = []
        for (_date, _title) in [clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, _title, ""))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": ""}, date=_date))
    elif dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title_nl) in [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, "", _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": "", "nl": _title_nl}, date=_date))

    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(Analysis, _id="lachambre_id",
                                          lachambre_id=dico["Analyse des interventions"]["head"].a.text,
                                          url=dico["Analyse des interventions"]["head"].a["href"])
Example 15: _build_pdf_sub_section
def _build_pdf_sub_section(i, dico, sub_section):
    key = clean_text(i.td.text)
    # we can have a list of joined documents
    if unicode(key) in (u'Document(s) joint(s)/lié(s)', u'Gekoppeld(e)/verbonden document(en)'):
        if not dico[sub_section].get(key):
            dico[sub_section][key] = []
        dico[sub_section][key].append(i('td')[1])
    elif dico[sub_section].get(key):
        raise Exception("'%s' is already used as a key in the sub_section '%s' for '%s'" % (key, sub_section, dico[sub_section][key]))
    else:
        dico[sub_section][key] = i('td')[1]