This article collects typical usage examples of the ftfy.fix_text method in Python. If you are wondering what ftfy.fix_text does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples for the ftfy module that the method belongs to.
The following presents 15 code examples of ftfy.fix_text, drawn from real projects and sorted by popularity by default.
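Before the project-specific examples, here is a minimal standalone sketch of what ftfy.fix_text itself does: it detects and repairs mojibake and related Unicode damage in a string. The sample input is a classic mojibake string; the output shown in the comment is what ftfy normally produces, so treat it as illustrative rather than authoritative.

import ftfy

# UTF-8 text that was mistakenly decoded as Windows-1252 ("mojibake").
broken = 'âœ” No problems'
print(ftfy.fix_text(broken))            # expected: '✔ No problems'

# Already-clean text passes through unchanged.
print(ftfy.fix_text('✔ No problems'))   # '✔ No problems'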
Example 1: encode
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in
                     self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in
                     self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
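In practice the encoder is built from a BPE vocabulary and merges file and then called on a list of raw strings; a rough usage sketch follows (the TextEncoder name and the file names are assumptions for illustration, not part of the example above):

# Hypothetical construction; the actual class and file names depend on the project.
text_encoder = TextEncoder('encoder_bpe_40000.json', 'vocab_40000.bpe')
token_ids = text_encoder.encode(['First document.', 'Second document.'], verbose=False)
# token_ids is a list of lists of BPE ids; unknown sub-tokens map to 0 via encoder.get(t, 0).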
Example 2: cnndm_generator
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def cnndm_generator(mode, seed=0, shuffle=False, comm=None):
    # data originally from https://github.com/abisee/cnn-dailymail
    if mode == 'valid':
        mode = 'val'
    with open(gcs.download_file_cached(f'gs://lm-human-preferences/datasets/cnndm/url_lists/all_{mode}.txt', comm=comm)) as f:
        urls = [line.strip() for line in f]
    if shuffle:
        random.seed(seed)
        random.shuffle(urls)
    # if n_eval > 0:
    #     urls = urls[:n_eval]

    urls_dir = gcs.download_directory_cached(f'gs://lm-human-preferences/datasets/cnndm/cache_{mode}', comm=comm)

    for i, url in enumerate(urls):
        path = os.path.join(urls_dir, get_path_of_url(url))
        text = open(path).read()
        text = clean_up_start(text)
        text = ftfy.fix_text(text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = text.split('@highlight')[0].strip()
        yield text
        # _, ref_sents = get_art_abs(path)
Example 3: tldr_generator
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def tldr_generator(mode, seed=0, shuffle=False, comm=None):
    random.seed(seed)
    if mode == 'test':
        mode = 'valid'  # validation set serves as training set, since we don't have access..
    assert mode in ['train', 'valid']

    with open(gcs.download_file_cached(f'gs://lm-human-preferences/tldr/{mode}-subset.json', comm=comm)) as f:
        datas = json.load(f)

    if shuffle:
        random.seed(seed)
        random.shuffle(datas)

    for data in datas:
        text = data['content']
        text = ftfy.fix_text(text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = text.strip()
        yield text
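Both generators above apply the same three-step cleanup: repair the encoding with ftfy.fix_text, collapse runs of three or more newlines into a single blank line, and strip surrounding whitespace. As a standalone sketch (the clean_document name is mine, not from either example):

import re
import ftfy

def clean_document(text):
    # Fix mojibake and other Unicode damage first, then normalize blank lines.
    text = ftfy.fix_text(text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()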
Example 4: __init__
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None

    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
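Note the design: the tokenizer stores a direct reference to ftfy.fix_text in self.fix_text and sets self.fix_text = None when ftfy or spaCy is missing. Later code (see set_special_tokens in Example 15) checks whether self.fix_text is None to decide whether it is running on the BERT BasicTokenizer fallback or the SpaCy + ftfy pipeline.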
Example 5: __init__
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

    try:
        import ftfy
        from spacy.lang.en import English
        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example 6: encode
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
Example 7: encode
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
Example 8: __init__
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None

    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.set_special_tokens(special_tokens)
Example 9: summarize_text
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def summarize_text(request):
    if request.html:
        parser = HtmlParser.from_file(file_path=request.html,
                                      url=request.url,
                                      tokenizer=Tokenizer(LANGUAGE))
    else:
        parser = PlaintextParser.from_file(file_path=request.html,
                                           tokenizer=Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = [fix_text(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
    html = generate_html(sentences, fix_text(request.title)).render()
    request.send_html(html)
Example 10: __init__
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example 11: get_modified_content
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def get_modified_content(self, format="srt", debug=False):
    """
    :return: string
    """
    if not self.mods:
        return fix_text(self.content.decode(encoding=self._guessed_encoding), **ftfy_defaults).encode(
            encoding=self._guessed_encoding)

    submods = SubtitleModifications(debug=debug)
    if submods.load(content=self.text, language=self.language):
        logger.info("Applying mods: %s", self.mods)
        submods.modify(*self.mods)
        self.mods = submods.mods_used

        content = fix_text(self.pysubs2_to_unicode(submods.f, format=format), **ftfy_defaults)\
            .encode(encoding=self._guessed_encoding)

        submods.f = None
        del submods
        return content
    return None
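Note that ftfy_defaults is not defined in this snippet; judging from how it is used, it is presumably a project-level dict of keyword arguments forwarded to fix_text (for example toggling specific fixers), so the same ftfy configuration is applied both to the raw subtitle content and to the modified output.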
Example 12: test
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def test(data):
    text, expected = data
    text2 = text.encode('cp437').decode('utf-8')
    text3 = unidecode(text2)
    text4 = unicodedata.normalize('NFC', text2)
    text5 = unidecode(text4)

    print('                                 text:', text, '| len:', len(text))
    print('                             expected:', expected, ' | len:', len(expected))
    print('                     text == expected:', text == expected)
    print('-------------------------------------')
    print('text.encode("cp437").decode("utf-8"):', text2, ' | len:', len(text2), '| expected:', text2 == expected)
    print('                       unicode(text2):', text3, ' | len:', len(text3), '| expected:', text3 == expected)
    print('-------------------------------------')
    print(' unicodedata.normalize("NFC", text2):', text4, ' | len:', len(text4), '| expected:', text4 == expected)
    print('                       unicode(text4):', text5, ' | len:', len(text5), '| expected:', text5 == expected)
    print('-------------------------------------')
    print('                 ftfy.fix_text(text):', ftfy.fix_text(text))
    print('-------------------------------------')
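This test contrasts a hand-rolled repair (re-encoding as cp437 and re-decoding as UTF-8, optionally followed by unidecode or NFC normalization) with a single call to ftfy.fix_text, which guesses the offending decode automatically instead of requiring the caller to know which codec produced the mojibake.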
Example 13: search_citation
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def search_citation(text, exp):
    '''Finds sentences around citations, where the regexp `exp` matches'''
    text = text.decode('utf-8')
    lines = text.split('\n')
    text = ' '.join(lines)
    text = ' '.join(text.split())
    text = ftfy.fix_text(text)
    logging.info("Search...'{0!s}'".format(exp))
    sentences = split_sentences(text)
    regex = re.compile(exp, flags=(re.I))
    founds = set()
    for sent in sentences:
        if regex.search(sent):
            founds.add(sent)
    return founds
Example 14: fix_string
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def fix_string(val):
    if isinstance(val, list):
        val = [fixCase(tmp) for tmp in val]
        return val

    if not val:
        return val

    val = fixUnicodeSpaces(val)
    val = fixSmartQuotes(val)
    val = fixCase(val)
    val = fixHtmlEntities(val)
    val = ftfy.fix_text(val)
    return val
Example 15: set_special_tokens
# Required import: import ftfy [as alias]
# Or: from ftfy import fix_text [as alias]
def set_special_tokens(self, special_tokens):
    """ Add a list of additional tokens to the encoder.
        The additional tokens are indexed starting from the last index of the
        current vocabulary in the order of the `special_tokens` list.
    """
    if not special_tokens:
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        return
    self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
    self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
    if self.fix_text is None:
        # Using BERT's BasicTokenizer: we can update the tokenizer
        self.nlp.never_split = special_tokens
    logger.info("Special tokens {}".format(self.special_tokens))