This article collects typical usage examples of the UnicodeDammit.split method from Python's bs4 package. If you have been wondering exactly what UnicodeDammit.split does, how to use it, or what it looks like in real code, the curated examples below may help. You can also read further usage examples for the containing class, bs4.UnicodeDammit.
The following six code examples of UnicodeDammit.split are sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
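Note that split is not actually defined on UnicodeDammit itself; in every example below it is the ordinary string split called on the decoded unicode_markup attribute (or on a line that has already been decoded). A minimal, self-contained sketch of that shared idiom, not taken from any of the projects below, looks like this:

from bs4 import UnicodeDammit

# Raw bytes whose encoding is unknown (these happen to be windows-1252).
raw = b'pri\xe8re de r\xe9pondre'

# Ask UnicodeDammit to try utf-8 first, then windows-1252.
dammit = UnicodeDammit(raw, ['utf-8', 'windows-1252'])
print(dammit.original_encoding)   # e.g. 'windows-1252'

# split() is called on the decoded string, not on the UnicodeDammit object.
fields = dammit.unicode_markup.split()
print(fields)                     # ['prière', 'de', 'répondre']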
Example 1: _sub_read
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def _sub_read(self, f):
    example_num = 0
    curr_id = 'EXAMPLE_0'
    for line in f:
        # Process encoding
        if not isinstance(line, text_type):
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
        line = line.strip()
        # Handle instance lines
        if line.startswith('#'):
            curr_id = line[1:].strip()
        elif line and line not in ['TRAIN', 'TEST', 'DEV']:
            split_line = line.split()
            num_cols = len(split_line)
            del line
            # Line is just a class label
            if num_cols == 1:
                class_name = safe_float(split_line[0],
                                        replace_dict=self.class_map)
                field_pairs = []
            # Line has a class label and feature-value pairs
            elif num_cols % 2 == 1:
                class_name = safe_float(split_line[0],
                                        replace_dict=self.class_map)
                field_pairs = split_line[1:]
            # Line just has feature-value pairs
            elif num_cols % 2 == 0:
                class_name = None
                field_pairs = split_line
            curr_info_dict = {}
            if len(field_pairs) > 0:
                # Get the current instance's feature-value pairs
                field_names = islice(field_pairs, 0, None, 2)
                # Convert values to floats, because otherwise
                # features will be treated as categorical
                field_values = (safe_float(val) for val in
                                islice(field_pairs, 1, None, 2))
                # Add the feature-value pairs to the dictionary
                curr_info_dict.update(zip(field_names, field_values))
                if len(curr_info_dict) != len(field_pairs) / 2:
                    raise ValueError(('There are duplicate feature ' +
                                      'names in {} for example ' +
                                      '{}.').format(self.path_or_list,
                                                    curr_id))
            yield curr_id, class_name, curr_info_dict
            # Set default example ID for the next instance, in case we see a
            # line without an ID.
            example_num += 1
            curr_id = 'EXAMPLE_{}'.format(example_num)
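Example 1 is a method of a SKLL reader class, so it does not run on its own: safe_float, self.class_map, and self.path_or_list come from surrounding SKLL code that is not shown. A stripped-down, hypothetical version of the same decode-then-split parsing, using plain float instead of safe_float, might look like this:

from itertools import islice
from bs4 import UnicodeDammit

def parse_megam_line(raw_line):
    """Decode one MegaM-style line and return (label, feature_dict)."""
    if isinstance(raw_line, bytes):
        raw_line = UnicodeDammit(raw_line,
                                 ['utf-8', 'windows-1252']).unicode_markup
    fields = raw_line.strip().split()
    label, pairs = fields[0], fields[1:]
    names = islice(pairs, 0, None, 2)
    values = (float(v) for v in islice(pairs, 1, None, 2))
    return label, dict(zip(names, values))

print(parse_megam_line(b'spam w\xe9ight 2.0 length 14'))
# ('spam', {'wéight': 2.0, 'length': 14.0})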
Example 2: convert_to_libsvm
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def convert_to_libsvm(lines):
    '''
    Converts a sequence of lines (e.g., a file or list of strings) in MegaM
    format to LibSVM format.

    :param lines: The sequence of lines to convert.
    :type lines: L{file} or L{list} of L{str}

    :return: A tuple of the newly formatted data, the mappings from class names
             to numbers, and the mappings from feature names to numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})
    '''
    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()
    result_list = []
    # Iterate through MegaM file
    for line in lines:
        line_fields = set()
        # Process encoding
        line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup.strip()
        # Ignore comments (and TEST/DEV lines)
        if not line.startswith('#') and not line == 'TEST' and not line == 'DEV':
            result_string = ''
            split_line = line.split()
            result_string += '{0}'.format(class_num_dict[split_line[0]])
            # Handle features if there are any
            if len(split_line) > 1:
                del split_line[0]
                # Loop through all feature-value pairs, printing out pairs
                # separated by spaces (and with feature names replaced with
                # numbers)
                for field_num, value in sorted(zip((field_num_dict[field_name] for field_name in islice(split_line, 0, None, 2)),
                                                   (float(value) if value != 'N/A' else 0.0 for value in islice(split_line, 1, None, 2)))):
                    # Check for duplicates
                    if field_num in line_fields:
                        # built-in next() rather than the Python 2-only .next()
                        field_name = next(field_name for field_name, f_num in field_num_dict.items() if f_num == field_num)
                        raise AssertionError("Field {} occurs on same line twice.".format(field_name))
                    # Otherwise output non-empty features
                    elif value != 'N/A' and float(value):
                        result_string += ' {}:{}'.format(field_num, value)
                        line_fields.add(field_num)
            result_list.append(result_string)
    return result_list, class_num_dict, field_num_dict
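convert_to_libsvm relies on UniqueNumberDict, which is not shown in the snippet. Judging from how it is used, looking up an unseen class or feature name must assign it a fresh number, so a minimal stand-in could be a dict that hands out sequential IDs on first access. Starting the numbering at 1 is an assumption based on LibSVM's 1-based feature indices, and the printed output below depends on it:

from itertools import islice        # needed by convert_to_libsvm above
from bs4 import UnicodeDammit       # needed by convert_to_libsvm above

class UniqueNumberDict(dict):
    """Assign the next unused number to any key seen for the first time."""
    def __missing__(self, key):
        number = len(self) + 1      # assumption: numbering starts at 1
        self[key] = number
        return number

# Hypothetical usage with two MegaM-style lines:
lines = ['spam height 2.0 width 3.5', 'ham height 1.0 weight 0.0']
data, class_map, feat_map = convert_to_libsvm(lines)
print(data)                         # e.g. ['1 1:2.0 2:3.5', '2 1:1.0']
print(dict(class_map), dict(feat_map))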
Example 3: main
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l',
                             '--label_col',
                             help='Name of the column which contains the class \
                                   labels in ARFF, CSV, or TSV files. For ARFF \
                                   files, this must be the final column to count as\
                                   the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
#......... the rest of the code is omitted here .........
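The reuse_libsvm_map branch assumes that each line of a SKLL-written LibSVM file carries a trailing comment of the form "# example_id | label_map | feature_map", where each map is a whitespace-separated list of pairs; that much follows from the parsing code above. _pair_to_dict_tuple itself is not shown in the snippet, so the stand-in below is hypothetical, and the name=number pair format is an additional assumption:

def _pair_to_dict_tuple(pair):
    # Hypothetical helper: each pair is assumed to look like "name=number".
    name, num = pair.split('=')
    return name, int(num)

# Assumed shape of one SKLL-written LibSVM line:
line = '3 1:1.0 4:2.5 # EXAMPLE_1 | cat=3 | height=1 width=4'
comments = line.split('#')[1]
_, label_map_str, feat_map_str = comments.split('|')
print(dict(_pair_to_dict_tuple(p) for p in feat_map_str.strip().split()))
# {'height': 1, 'width': 4}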
Example 4: on_pubmsg
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def on_pubmsg(self, c, e):
    nick = e.source.nick
    target = e.target if is_channel(e.target) else nick
    def reply(msg):
        self.send(target, msg)
    def dm(msg):
        self.send(nick, msg)
    line = UnicodeDammit(e.arguments[0]).unicode_markup
    log(' \033[37m{}→{}\033[0m'.format(nick, line))
    a = line.split(":", 1)
    if len(a) > 1 and a[0].lower() == self.nick:
        self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
        return
    # zeltofilter
    if 'zeltoph' in nick:
        return
    foo = settings.VIPS.get(nick, 0)
    if random() < foo:
        self.kick(nick)
    match = re.match('.*┻━┻.*', line)
    if match:
        reply('┬─┬ノ(ಠ_ಠノ)')
        return
    match = re.match('^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(self.nick), line)
    if match:
        newcs = match.group(3)
        self.chaossternchen.append(newcs)
        self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(len(self.chaossternchen), newcs))
        return
    if line.startswith('.wiki '):
        wikipage = line[len('.wiki '):].strip()
        if re.match('^[-_+\w]+$', wikipage):
            wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
            if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
                reply("I'm sorry, I can't find a wiki page with that name.")
            else:
                reply(wikiurl)
        else:
            reply('Try to troll somebot else.')
        return
    if line == 'wat?':
        reply("I don't have a clue.")
        return
    if re.match('^hail eris[.!]* ', line.lower()):
        reply("All Hail Discordia!")
        return
    m = re.findall('(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
    for _1, match, _2 in m:
        if not re.match('(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
            self.kick(nick, "It's spelled Gandhi")
            return
    if re.search('https?://[-a-z0-9.]*facebook.com', line.lower()):
        reply('A facebook link? srsly? Get some self-respect!')
        return
    match = re.search('https?://pr0gramm.com/#(newest/\*/[0-9/]*)', line.lower())
    if match:
        reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
        return
    if line == 'moin':
        self.moincount += 1
        if self.moincount == 5:
            reply('moin')
        return
    else:
        self.moincount = 0
    if line.lstrip('.!#').startswith('eta '):
        eta = line[4:].strip()
        with self.db as db:
            db.execute("DELETE FROM etas WHERE nick=?", (nick,))
            if eta:
                db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
        dm('ETA registered. Thanks!')
        return
    m = re.findall(URL_REGEX, line.lower())
    for url, *_ in m:
        res = requests.get(url)
        if res.status_code == requests.codes.ok:
            soup = BeautifulSoup(res.text)
            reply(soup.title.string)
    m = re.findall('(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
    for _1, match, _2 in m:
        if match != 'AfRA' and match != 'afra' and random() < 0.1:
            reply("I'm sure you meant AfRA, not "+match)
            return
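In this IRC handler, UnicodeDammit is called with no candidate encodings, so encoding detection is fully automatic, and split(':', 1) is then used to check whether the message is addressed to the bot. A self-contained sketch of just that addressing check, with a hypothetical botnick in place of the bot's self.nick, might be:

from bs4 import UnicodeDammit

botnick = 'chaosbot'  # hypothetical nick; the real bot uses self.nick

def parse_command(raw, nick=botnick):
    """Return the command text if the line reads 'nick: command', else None."""
    line = UnicodeDammit(raw).unicode_markup
    parts = line.split(':', 1)
    if len(parts) > 1 and parts[0].strip().lower() == nick:
        return parts[1].strip().lower()
    return None

print(parse_command(b'Chaosbot: ping'))   # 'ping'
print(parse_command(b'just chatting'))    # None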
Example 5: main
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(description="Takes an input feature file \
                                                  and converts it to another \
                                                  format. Formats are \
                                                  determined automatically from\
                                                  file extensions.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from classes and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == ".tsv":
        example_iter_type = _TSVDictIter
    elif input_extension == ".jsonlines" or input_extension == '.ndj':
        example_iter_type = _JSONDictIter
    elif input_extension == ".libsvm":
        example_iter_type = _LibSVMDictIter
    elif input_extension == ".megam":
        example_iter_type = _MegaMDictIter
    elif input_extension == ".csv":
        example_iter_type = _CSVDictIter
    elif input_extension == ".arff":
        example_iter_type = _ARFFDictIter
    else:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None
#......... the rest of the code is omitted here .........
Example 6: UnicodeDammit
# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import split [as alias]
if(isinstance(block, Tag)):
    continue
# UnicodeDammit converts any string to UTF-8
# (it does not always work so well)
block = UnicodeDammit(block, soup.original_encoding).unicode_markup
# remove leading and trailing end-of-line characters
block = block.strip('\n')
# if the block doesn't have any text, skip it
if( re.search('\w', block) == None ):
    continue
# bs4 does not always split the different blocks cleanly,
# so it is better to re-split by paragraph and handle each one separately
for line in block.split('\n'):
    stripped_line = line.strip(' \n\t\r')
    if( re.search('\w', line) == None ):
        continue
    print('------------------------------ Begin line ------------------------------')
    print(line)
    print(' ------- End line -------')
    if( is_intro ):
        print()
        answer = input("Is that still part of the intro? (Y/n) ")
        if(answer == 'n' or answer == 'N'):
            is_intro = False
    movie_script.append({