This article collects typical usage examples of the UnicodeDammit.split method from Python's bs4 package. If you have been wondering exactly what UnicodeDammit.split does, how to use it, or what it looks like in real code, the curated examples below may help. You can also read further usage examples for the containing class, bs4.UnicodeDammit.
The following six code examples of UnicodeDammit.split are sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
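Note that split is not actually defined on UnicodeDammit itself; in every example below it is the ordinary string split called on the decoded unicode_markup attribute (or on a line that has already been decoded). A minimal, self-contained sketch of that shared idiom, not taken from any of the projects below, looks like this:

from bs4 import UnicodeDammit

# Raw bytes whose encoding is unknown (these happen to be windows-1252).
raw = b'pri\xe8re de r\xe9pondre'

# Ask UnicodeDammit to try utf-8 first, then windows-1252.
dammit = UnicodeDammit(raw, ['utf-8', 'windows-1252'])
print(dammit.original_encoding)   # e.g. 'windows-1252'

# split() is called on the decoded string, not on the UnicodeDammit object.
fields = dammit.unicode_markup.split()
print(fields)                     # ['prière', 'de', 'répondre']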
Example 1: _sub_read
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def _sub_read(self, f):
    example_num = 0
    curr_id = 'EXAMPLE_0'
    for line in f:
        # Process encoding
        if not isinstance(line, text_type):
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
        line = line.strip()
        # Handle instance lines
        if line.startswith('#'):
            curr_id = line[1:].strip()
        elif line and line not in ['TRAIN', 'TEST', 'DEV']:
            split_line = line.split()
            num_cols = len(split_line)
            del line
            # Line is just a class label
            if num_cols == 1:
                class_name = safe_float(split_line[0],
                                        replace_dict=self.class_map)
                field_pairs = []
            # Line has a class label and feature-value pairs
            elif num_cols % 2 == 1:
                class_name = safe_float(split_line[0],
                                        replace_dict=self.class_map)
                field_pairs = split_line[1:]
            # Line just has feature-value pairs
            elif num_cols % 2 == 0:
                class_name = None
                field_pairs = split_line
            curr_info_dict = {}
            if len(field_pairs) > 0:
                # Get the current instance's feature-value pairs
                field_names = islice(field_pairs, 0, None, 2)
                # Convert values to floats, because otherwise
                # features will be treated as categorical
                field_values = (safe_float(val) for val in
                                islice(field_pairs, 1, None, 2))
                # Add the feature-value pairs to the dictionary
                curr_info_dict.update(zip(field_names, field_values))
                if len(curr_info_dict) != len(field_pairs) / 2:
                    raise ValueError(('There are duplicate feature ' +
                                      'names in {} for example ' +
                                      '{}.').format(self.path_or_list,
                                                    curr_id))
            yield curr_id, class_name, curr_info_dict
            # Set default example ID for the next instance, in case we see a
            # line without an ID.
            example_num += 1
            curr_id = 'EXAMPLE_{}'.format(example_num)
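Example 1 is a method of a SKLL reader class, so it does not run on its own: safe_float, self.class_map, and self.path_or_list come from surrounding SKLL code that is not shown. A stripped-down, hypothetical version of the same decode-then-split parsing, using plain float instead of safe_float, might look like this:

from itertools import islice
from bs4 import UnicodeDammit

def parse_megam_line(raw_line):
    """Decode one MegaM-style line and return (label, feature_dict)."""
    if isinstance(raw_line, bytes):
        raw_line = UnicodeDammit(raw_line,
                                 ['utf-8', 'windows-1252']).unicode_markup
    fields = raw_line.strip().split()
    label, pairs = fields[0], fields[1:]
    names = islice(pairs, 0, None, 2)
    values = (float(v) for v in islice(pairs, 1, None, 2))
    return label, dict(zip(names, values))

print(parse_megam_line(b'spam w\xe9ight 2.0 length 14'))
# ('spam', {'wéight': 2.0, 'length': 14.0})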
Example 2: convert_to_libsvm
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def convert_to_libsvm(lines):
    '''
    Converts a sequence of lines (e.g., a file or list of strings) in MegaM
    format to LibSVM format.

    :param lines: The sequence of lines to convert.
    :type lines: L{file} or L{list} of L{str}

    :return: A tuple of the newly formatted data, the mappings from class names
             to numbers, and the mappings from feature names to numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})
    '''
    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()
    result_list = []
    # Iterate through MegaM file
    for line in lines:
        line_fields = set()
        # Process encoding
        line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup.strip()
        # Ignore comments (and TEST/DEV lines)
        if not line.startswith('#') and not line == 'TEST' and not line == 'DEV':
            result_string = ''
            split_line = line.split()
            result_string += '{0}'.format(class_num_dict[split_line[0]])
            # Handle features if there are any
            if len(split_line) > 1:
                del split_line[0]
                # Loop through all feature-value pairs, printing out pairs
                # separated by spaces (and with feature names replaced with
                # numbers)
                for field_num, value in sorted(zip((field_num_dict[field_name] for field_name in islice(split_line, 0, None, 2)),
                                                   (float(value) if value != 'N/A' else 0.0 for value in islice(split_line, 1, None, 2)))):
                    # Check for duplicates
                    if field_num in line_fields:
                        # built-in next() rather than the Python 2-only .next()
                        field_name = next(field_name for field_name, f_num in field_num_dict.items() if f_num == field_num)
                        raise AssertionError("Field {} occurs on same line twice.".format(field_name))
                    # Otherwise output non-empty features
                    elif value != 'N/A' and float(value):
                        result_string += ' {}:{}'.format(field_num, value)
                        line_fields.add(field_num)
            result_list.append(result_string)
    return result_list, class_num_dict, field_num_dict
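convert_to_libsvm relies on UniqueNumberDict, which is not shown in the snippet. Judging from how it is used, looking up an unseen class or feature name must assign it a fresh number, so a minimal stand-in could be a dict that hands out sequential IDs on first access. Starting the numbering at 1 is an assumption based on LibSVM's 1-based feature indices, and the printed output below depends on it:

from itertools import islice        # needed by convert_to_libsvm above
from bs4 import UnicodeDammit       # needed by convert_to_libsvm above

class UniqueNumberDict(dict):
    """Assign the next unused number to any key seen for the first time."""
    def __missing__(self, key):
        number = len(self) + 1      # assumption: numbering starts at 1
        self[key] = number
        return number

# Hypothetical usage with two MegaM-style lines:
lines = ['spam height 2.0 width 3.5', 'ham height 1.0 weight 0.0']
data, class_map, feat_map = convert_to_libsvm(lines)
print(data)                         # e.g. ['1 1:2.0 2:3.5', '2 1:1.0']
print(dict(class_map), dict(feat_map))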
Example 3: main
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l',
                             '--label_col',
                             help='Name of the column which contains the class \
                                   labels in ARFF, CSV, or TSV files. For ARFF \
                                   files, this must be the final column to count as\
                                   the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
#......... the rest of the code is omitted here .........
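The reuse_libsvm_map branch assumes that each line of a SKLL-written LibSVM file carries a trailing comment of the form "# example_id | label_map | feature_map", where each map is a whitespace-separated list of pairs; that much follows from the parsing code above. _pair_to_dict_tuple itself is not shown in the snippet, so the stand-in below is hypothetical, and the name=number pair format is an additional assumption:

def _pair_to_dict_tuple(pair):
    # Hypothetical helper: each pair is assumed to look like "name=number".
    name, num = pair.split('=')
    return name, int(num)

# Assumed shape of one SKLL-written LibSVM line:
line = '3 1:1.0 4:2.5 # EXAMPLE_1 | cat=3 | height=1 width=4'
comments = line.split('#')[1]
_, label_map_str, feat_map_str = comments.split('|')
print(dict(_pair_to_dict_tuple(p) for p in feat_map_str.strip().split()))
# {'height': 1, 'width': 4}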
Example 4: on_pubmsg
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def on_pubmsg(self, c, e):
    nick = e.source.nick
    target = e.target if is_channel(e.target) else nick
    def reply(msg):
        self.send(target, msg)
    def dm(msg):
        self.send(nick, msg)
    line = UnicodeDammit(e.arguments[0]).unicode_markup
    log(' \033[37m{}→{}\033[0m'.format(nick, line))
    a = line.split(":", 1)
    if len(a) > 1 and a[0].lower() == self.nick:
        self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
        return
    # zeltofilter
    if 'zeltoph' in nick:
        return
    foo = settings.VIPS.get(nick, 0)
    if random() < foo:
        self.kick(nick)
    match = re.match('.*┻━┻.*', line)
    if match:
        reply('┬─┬ノ(ಠ_ಠノ)')
        return
    match = re.match('^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(self.nick), line)
    if match:
        newcs = match.group(3)
        self.chaossternchen.append(newcs)
        self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(len(self.chaossternchen), newcs))
        return
    if line.startswith('.wiki '):
        wikipage = line[len('.wiki '):].strip()
        if re.match('^[-_+\w]+$', wikipage):
            wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
            if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
                reply("I'm sorry, I can't find a wiki page with that name.")
            else:
                reply(wikiurl)
        else:
            reply('Try to troll somebot else.')
        return
    if line == 'wat?':
        reply("I don't have a clue.")
        return
    if re.match('^hail eris[.!]* ', line.lower()):
        reply("All Hail Discordia!")
        return
    m = re.findall('(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
    for _1, match, _2 in m:
        if not re.match('(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
            self.kick(nick, "It's spelled Gandhi")
            return
    if re.search('https?://[-a-z0-9.]*facebook.com', line.lower()):
        reply('A facebook link? srsly? Get some self-respect!')
        return
    match = re.search('https?://pr0gramm.com/#(newest/\*/[0-9/]*)', line.lower())
    if match:
        reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
        return
    if line == 'moin':
        self.moincount += 1
        if self.moincount == 5:
            reply('moin')
        return
    else:
        self.moincount = 0
    if line.lstrip('.!#').startswith('eta '):
        eta = line[4:].strip()
        with self.db as db:
            db.execute("DELETE FROM etas WHERE nick=?", (nick,))
            if eta:
                db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
        dm('ETA registered. Thanks!')
        return
    m = re.findall(URL_REGEX, line.lower())
    for url, *_ in m:
        res = requests.get(url)
        if res.status_code == requests.codes.ok:
            soup = BeautifulSoup(res.text)
            reply(soup.title.string)
    m = re.findall('(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
    for _1, match, _2 in m:
        if match != 'AfRA' and match != 'afra' and random() < 0.1:
            reply("I'm sure you meant AfRA, not "+match)
            return
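In this IRC handler, UnicodeDammit is called with no candidate encodings, so encoding detection is fully automatic, and split(':', 1) is then used to check whether the message is addressed to the bot. A self-contained sketch of just that addressing check, with a hypothetical botnick in place of the bot's self.nick, might be:

from bs4 import UnicodeDammit

botnick = 'chaosbot'  # hypothetical nick; the real bot uses self.nick

def parse_command(raw, nick=botnick):
    """Return the command text if the line reads 'nick: command', else None."""
    line = UnicodeDammit(raw).unicode_markup
    parts = line.split(':', 1)
    if len(parts) > 1 and parts[0].strip().lower() == nick:
        return parts[1].strip().lower()
    return None

print(parse_command(b'Chaosbot: ping'))   # 'ping'
print(parse_command(b'just chatting'))    # None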
Example 5: main
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(description="Takes an input feature file \
                                                  and converts it to another \
                                                  format. Formats are \
                                                  determined automatically from\
                                                  file extensions.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from classes and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == ".tsv":
        example_iter_type = _TSVDictIter
    elif input_extension == ".jsonlines" or input_extension == '.ndj':
        example_iter_type = _JSONDictIter
    elif input_extension == ".libsvm":
        example_iter_type = _LibSVMDictIter
    elif input_extension == ".megam":
        example_iter_type = _MegaMDictIter
    elif input_extension == ".csv":
        example_iter_type = _CSVDictIter
    elif input_extension == ".arff":
        example_iter_type = _ARFFDictIter
    else:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None
#......... the rest of the code is omitted here .........
Example 6: UnicodeDammit
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
if(isinstance(block, Tag)):
    continue
# UnicodeDammit converts any string to UTF-8
# (it does not always work so well)
block = UnicodeDammit(block, soup.original_encoding).unicode_markup
# remove leading and trailing end-of-line characters
block = block.strip('\n')
# if the block doesn't have any text, skip it
if( re.search('\w', block) == None ):
    continue
# bs4 does not always split the different blocks cleanly,
# so it is better to re-split by paragraph and handle each one separately
for line in block.split('\n'):
    stripped_line = line.strip(' \n\t\r')
    if( re.search('\w', line) == None ):
        continue
    print('------------------------------ Begin line ------------------------------')
    print(line)
    print(' ------- End line -------')
    if( is_intro ):
        print()
        answer = input("Is that still part of the intro? (Y/n) ")
        if(answer == 'n' or answer == 'N'):
            is_intro = False
    movie_script.append({