This article collects typical usage examples of the Python function utils.clean_text. If you have been wondering what clean_text does and how to call it, the hand-picked code examples here may help.
Shown below are 15 code examples of the clean_text function, sorted by popularity by default.
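Note that each project below ships its own clean_text, so the exact behavior varies from repository to repository. As rough orientation only, here is a hypothetical sketch of what such a helper commonly does; it is not the implementation used by any example below:

import re

def clean_text(text):
    # Hypothetical sketch of a typical clean_text helper; the real helpers
    # used by the examples below live in each project's own utils module.
    text = text.replace(u"\xa0", u" ")   # normalize non-breaking spaces
    text = re.sub(r"\s+", u" ", text)    # collapse runs of whitespace
    return text.strip()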
Example 1: _get_in_charged_commissions
def _get_in_charged_commissions(dico, dico_nl, document):
    document.in_charge_commissions = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?COMMISSION CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?COMMISSIE KAMER", x), dico_nl.keys()))):
        icc = InChargeCommissions()
        icc.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        icc.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        icc.commission["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        icc.commission["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])
        if dico[key].get("Rapporteur"):
            # FIXME link to actual deputies
            icc.rapporters = map(clean_text, dico[key]["Rapporteur"].text.split("\n\t\t\t\t\t"))

        icc.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        icc.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        if dico[key].get("Rapport"):
            icc.rapport = {"url": dico[key]["Rapport"].a["href"], "date": clean_text(dico[key]["Rapport"].contents[-2])}

        icc.save()
        document.in_charge_commissions.append(icc)
Example 2: _get_document_chambre
def _get_document_chambre(dico, dico_nl, document):
    if not dico.get("Document Chambre"):
        return

    chambre_dico = dico['Document Chambre']
    chambre_dico_nl = dico_nl['Document Kamer']

    document_chambre = DocumentChambre()
    document_chambre.deposition_date = get_text_else_blank(chambre_dico, u'Date de dépôt')
    document_chambre.type["fr"] = chambre_dico[u'Type de document'].text
    document_chambre.type["nl"] = chambre_dico_nl[u'Document type'].text
    document_chambre.taken_in_account_date = get_text_else_blank(chambre_dico, u'Prise en considération')
    document_chambre.distribution_date = get_text_else_blank(chambre_dico, u'Date de distribution')
    document_chambre.sending_date = get_text_else_blank(chambre_dico, u'Date d\'envoi')
    document_chambre.ending_date = get_text_else_blank(chambre_dico, u'Date de fin')
    document_chambre.status["fr"] = get_text_else_blank(chambre_dico, u'Statut')
    document_chambre.status["nl"] = get_text_else_blank(chambre_dico_nl, u'Status')
    document_chambre.comments["fr"] = get_text_else_blank(chambre_dico, u'Commentaire').split(' ')
    document_chambre.comments["nl"] = get_text_else_blank(chambre_dico_nl, u'Commentaar').split(' ')

    _get_authors(chambre_dico, chambre_dico_nl, document_chambre)

    url, tipe, session = clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />")
    _, tipe_nl, _ = clean_text(str(chambre_dico_nl[u'head']).replace(" ", "")).split("<br />")
    url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url
    document_chambre.pdf = DocumentChambrePdf.objects.create(url=url,
                                                             type={"fr": tipe.strip(), "nl": tipe_nl.strip()},
                                                             session=session.split()[-2])

    _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre)

    if chambre_dico.get(u'Document(s) joint(s)/lié(s)'):
        document_chambre.joint_pdfs = [{"url": x.a["href"], "title": {"fr": x.contents[0][1:-1], "nl": y.contents[0][1:-1]}}
                                       for x, y in zip(chambre_dico[u'Document(s) joint(s)/lié(s)'],
                                                       chambre_dico_nl[u'Gekoppeld(e)/verbonden document(en)'])]

    document_chambre.save()
    document.document_chambre = document_chambre
Example 3: _get_plenaries
def _get_plenaries(dico, dico_nl, document):
    document.plenaries = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?SEANCE PLENIERE CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?PLENAIRE VERGADERING KAMER", x), dico_nl.keys()))):
        pl = DocumentPlenary()
        pl.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        pl.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        pl.type["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        pl.type["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])

        pl.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.save()
        document.plenaries.append(pl)
Example 4: parse_house_cosponsors
def parse_house_cosponsors(self, bill, cell):
    # if there's only one sponsor, we don't have to worry about this.
    if (not cell.a.nextSibling or
            not cell.a.nextSibling.nextSibling or
            'href' not in cell.a.nextSibling.nextSibling):
        cosponsor_dirty = cell.a.em.contents[0]
        cosponsor = clean_text(cosponsor_dirty)
        bill.add_sponsor('cosponsor', cosponsor,
                         sponsor_link=cell.a['href'])
    else:
        # there are several sponsors, and we have to go to the bill text
        bill_text_url = cell.a.nextSibling.nextSibling['href']
        try:
            doc = self.urlopen(bill_text_url)
            # people between (Sponsor) and (Co-Sponsor) are the cosponsors
            m = re.search(r"\(Sponsor\),?(.*)\(Co", doc, re.DOTALL)
            if m:
                cosponsor_list = clean_text(m.group(1))
                cosponsor_list = re.split(" ?(?:,| AND ) ?",
                                          cosponsor_list)
                for cosponsor_dirty in cosponsor_list:
                    cosponsor = clean_text(cosponsor_dirty)
                    bill.add_sponsor('cosponsor', cosponsor)
        except urllib2.HTTPError as e:
            if e.code == 404:
                # Some of the bill text pages are broken, but the
                # rest of the bill metadata is valid, so just
                # log the error and move on
                self.log('404 on %s, continuing' % bill_text_url)
            else:
                raise e
Example 5: parse_cosponsors_from_bill
def parse_cosponsors_from_bill(self, bill, url):
    bill_page = self.urlopen(url)
    bill_page = lxml.html.fromstring(bill_page)
    sponsors_text = find_nodes_with_matching_text(
        bill_page, '//p/span', r'\s*INTRODUCED.*')
    if len(sponsors_text) == 0:
        # probably it's withdrawn
        return
    sponsors_text = sponsors_text[0].text_content()
    sponsors = clean_text(sponsors_text).split(',')
    # if there are several comma-separated entries, list them.
    if len(sponsors) > 1:
        # the sponsor and the cosponsor were already taken from the
        # previous page, so ignore those:
        sponsors = sponsors[2::]
    for part in sponsors:
        parts = re.split(r' (?i)and ', part)
        for sponsor in parts:
            cosponsor_name = clean_text(sponsor)
            if cosponsor_name != "":
                cosponsor_name = cosponsor_name.replace(
                    u'\u00a0', " ")  # epic hax
                for name in re.split(r'\s+AND\s+', cosponsor_name):
                    # for name in cosponsor_name.split("AND"):
                    name = name.strip()
                    if name:
                        bill.add_sponsor('cosponsor', name)
Example 6: add_text
def add_text(status):
    """ Shortens the text to 140 characters for displaying it in the list control. """
    message = ""
    if status.has_key("copy_history"):
        txt = status["copy_history"][0]["text"]
    else:
        txt = status["text"]
    if len(txt) < 140:
        message = utils.clean_text(txt)
    else:
        message = utils.clean_text(txt[:139])
    return message
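A hypothetical call, for illustration only (the status dictionary below is invented; per the code above, "copy_history" is where the text of a reposted status lives):

status = {"text": u"A very long post body... " * 20}
message = add_text(status)  # longer than 140 chars, so cut to 139 before cleaning
print message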
Example 7: _build_sub_section
def _build_sub_section(i, dico):
    sub_section = clean_text(i.td.b.text)
    if dico.get(sub_section):
        raise Exception("'%s' is already used as a key for '%s'" % (sub_section, dico[sub_section]))
    dico[sub_section] = AccessControlDict()
    dico[sub_section]["head"] = i('td')[1]
    return sub_section
Example 8: tag_tokens
def tag_tokens(self, tokens, no_repeats=False):
    """
    Runs the SRL process on the given tokens.

    :param tokens: a list of tokens (as strings)
    :param no_repeats: whether to prevent repeated argument labels
    :returns: a list of lists (one list for each sentence). Sentences have tuples
        (all_tokens, predicate, arg_structure), where arg_structure is a dictionary
        mapping argument labels to the words they include.
    """
    tokens_obj = [attributes.Token(utils.clean_text(t, False)) for t in tokens]
    converted_bound = np.array([self.boundary_reader.converter.convert(t)
                                for t in tokens_obj])
    converted_class = np.array([self.classify_reader.converter.convert(t)
                                for t in tokens_obj])

    pred_positions = self.find_predicates(tokens_obj)

    # first, argument boundary detection
    # the answer includes all predicates
    answers = self.boundary_nn.tag_sentence(converted_bound, pred_positions)
    boundaries = [[self.boundary_itd[x] for x in pred_answer]
                  for pred_answer in answers]
    arg_limits = [utils.boundaries_to_arg_limits(pred_boundaries)
                  for pred_boundaries in boundaries]

    # now, argument classification
    answers = self.classify_nn.tag_sentence(converted_class,
                                            pred_positions, arg_limits,
                                            allow_repeats=not no_repeats)
    arguments = [[self.classify_itd[x] for x in pred_answer]
                 for pred_answer in answers]

    structures = _group_arguments(tokens, pred_positions, boundaries, arguments)
    return SRLAnnotatedSentence(tokens, structures)
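A hypothetical usage sketch, based only on the docstring above; tagger stands for an already-initialized instance of the class that defines tag_tokens:

# Hypothetical usage; assumes `tagger` is an initialized instance of the
# SRL tagger class this method belongs to (setup not shown here).
tokens = [u"John", u"bought", u"a", u"new", u"car"]
annotated = tagger.tag_tokens(tokens, no_repeats=True)
# Per the docstring, the result wraps (all_tokens, predicate, arg_structure)
# tuples, where arg_structure maps argument labels to their words.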
Example 9: parse_cosponsors_from_bill
def parse_cosponsors_from_bill(self, bill, url):
    with self.urlopen(url) as bill_page:
        bill_page = lxml.html.fromstring(bill_page)
        sponsors_text = find_nodes_with_matching_text(bill_page, '//p/span', r'\s*INTRODUCED.*')
        if len(sponsors_text) == 0:
            # probably it's withdrawn
            return
        sponsors_text = sponsors_text[0].text_content()
        sponsors = clean_text(sponsors_text).split(',')
        if len(sponsors) > 1:  # if there are several comma-separated entries, list them.
            # the sponsor and the cosponsor were already taken from the
            # previous page, so ignore those:
            sponsors = sponsors[2::]
        for part in sponsors:
            parts = re.split(r' (?i)and ', part)
            for sponsor in parts:
                bill.add_sponsor('cosponsor', clean_text(sponsor))
Example 10: df_transform
def df_transform(self, terms):
    self.df[pd.isnull(self.df['Comment'])] = ""
    self.df = self.df.drop_duplicates('Comment')
    self.df['date'] = self.df['date'].apply(lambda x: unix_convert(x))
    self.df['Comment'] = self.df['Comment'].apply(lambda x: clean_text(str(x)))
    self.df['Sentiment_raw'] = self.df.apply(lambda row: sentiment(row['Comment']), axis=1)
    self.df['Sentiment'] = self.df.apply(lambda row: sentiment_new(row['Comment'], terms), axis=1)
    self.df['State'] = self.df.apply(lambda row: state_label(str(row['Locations'])), axis=1)
    self.df = pd.merge(self.df, self.longlat, how='left', on='State')
Example 11: _get_next_documents
def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre):
    if chambre_dico.get('Document(s) suivant(s)'):
        for d, d_nl in zip(document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)']),
                           document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])):
            logger.debug("add pdf %s" % clean_text(d[0].font.text))
            doc = OtherDocumentChambrePdf()
            doc.url = d[0].a['href'] if d[0].a else d[0].td.text
            doc.type["fr"] = clean_text(d[0].font.text)
            doc.type["nl"] = clean_text(d_nl[0].font.text)
            doc.distribution_date = d[1]('td')[-1].text
            for dep, dep_nl in zip(d[2:], d_nl[2:]):
                if dep.a:
                    lachambre_id = re.search('key=(\d+)', dep.a["href"]).groups()[0]
                    deputy = Deputy.objects.get(lachambre_id=lachambre_id)
                    doc.authors.append({"lachambre_id": deputy.lachambre_id,
                                        "id": deputy.id,
                                        "full_name": deputy.full_name,
                                        "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
                else:
                    doc.authors.append({"lachambre_id": -1,
                                        "id": -1,
                                        "full_name": dep('td')[-1].contents[2].strip(),
                                        "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
            doc.save()
            document_chambre.other_pdfs.append(doc)
Example 12: parse_stations
def parse_stations(self, html):
    bs = BeautifulSoup(html)
    tables = bs.findAll('table', {'class': 'show_fw'})
    st = {}
    for i in range(2):
        trs = tables[i].findAll('tr')
        direction = clean_text(trs[0].text.replace('Fahrtrichtung', ''))
        sta = []
        for tr in trs[2:-1]:
            if tr.a:
                sta.append((clean_text(tr.a.text), defaults.base_url + tr.a['href']))
            else:
                sta.append((clean_text(tr.text), None))
        st[direction] = sta
    return st
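For orientation, a hypothetical call; scraper stands for an instance of the class defining parse_stations and html for an already-fetched timetable page:

# Hypothetical usage; `scraper` and `html` are stand-ins for illustration.
stations = scraper.parse_stations(html)
# Expected shape, per the code above: one entry per direction, e.g.
# {u"Direction A": [(u"Some station", u"http://.../details"), (u"Other stop", None)], ...}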
Example 13: _build_first_level
def _build_first_level(i, dico):
    key = clean_text(i.td.text)
    # we can get several "Moniteur erratum" entries
    if unicode(key) in ('Moniteur erratum', 'Staatsblad erratum'):
        if not dico.get(key):
            dico[key] = []
        dico[key].append(i('td')[1])
    else:
        if dico.get(key):
            raise Exception("'%s' is already used as a key for '%s'" % (key, dico[key]))
        dico[key] = i('td')[1]
Example 14: _get_competences
def _get_competences(dico, dico_nl, document):
    # FIXME: meh, DRY
    if dico.get(u"Compétence") and dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title), (_, _title_nl) in zip([clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]],
                                                   [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]):
            logger.debug("append time line %s %s %s" % (_date, _title, _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))
    elif dico.get(u"Compétence"):
        document.timeline = []
        for (_date, _title) in [clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, _title, ""))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": ""}, date=_date))
    elif dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title_nl) in [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, "", _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": "", "nl": _title_nl}, date=_date))

    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(Analysis, _id="lachambre_id",
                                          lachambre_id=dico["Analyse des interventions"]["head"].a.text,
                                          url=dico["Analyse des interventions"]["head"].a["href"])
Example 15: _build_pdf_sub_section
def _build_pdf_sub_section(i, dico, sub_section):
    key = clean_text(i.td.text)
    # we can have a list of joined documents
    if unicode(key) in (u'Document(s) joint(s)/lié(s)', u'Gekoppeld(e)/verbonden document(en)'):
        if not dico[sub_section].get(key):
            dico[sub_section][key] = []
        dico[sub_section][key].append(i('td')[1])
    elif dico[sub_section].get(key):
        raise Exception("'%s' is already used as a key in the sub_section '%s' for '%s'" % (key, sub_section, dico[sub_section][key]))
    else:
        dico[sub_section][key] = i('td')[1]