This page collects typical usage examples of the Python method scanner.Scanner.from_file: what the method does and how to call it in practice. For more context, you can also look at usage examples for the containing class, scanner.Scanner.
Two code examples of Scanner.from_file are shown below, sorted by popularity by default.
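Before the full examples, here is a minimal usage sketch of Scanner.from_file itself. The file name my.scanner is hypothetical, and the assumption that the returned Scanner instance is applied directly to text as a tokenizer is inferred from how both examples below pass it to build_index; it is not taken verbatim from the scanner module.

# Minimal sketch: load a previously built scanner and use it as a tokenizer.
# 'my.scanner' is a placeholder path; the callable-on-text usage is an assumption.
from scanner import Scanner

tokenizer = Scanner.from_file('my.scanner')
features = list(tokenizer('some sample text'))
print('extracted %d features' % len(features))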
Example 1: main
# Required import: from scanner import Scanner
# Method under test: scanner.Scanner.from_file
def main(args):
    if args.temp:
        buckets_dir = args.temp
    else:
        buckets_dir = os.path.join(args.model, 'buckets')
    makedir(buckets_dir)

    bucketlist_path = os.path.join(args.model, 'bucketlist')
    index_path = os.path.join(args.model, 'paths')

    # display paths
    logger.info("index path: %s", index_path)
    logger.info("bucketlist path: %s", bucketlist_path)
    logger.info("buckets path: %s", buckets_dir)

    with open(index_path) as f:
        reader = csv.reader(f)
        items = list(reader)

    # Tokenize
    logger.info("will tokenize %d files" % len(items))
    if args.scanner:
        from scanner import Scanner
        tokenizer = Scanner.from_file(args.scanner)
        logger.info("using provided scanner: %s", args.scanner)
    elif args.prager:
        tokenizer = PragerTokenizer(args.order, use_words=args.words)
        logger.info("using Prager tokenization: order[{0}] use_words[{1}]".format(args.order, args.words))
    else:
        tokenizer = NGramTokenizer(args.min_order, args.max_order)
        logger.info("using n-gram tokenizer: order {0}-{1}".format(args.min_order, args.max_order))

    b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size)

    # output the paths to the buckets
    with open(bucketlist_path, 'w') as f:
        for d in b_dirs:
            f.write(d + '\n')
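Example 1 reads many attributes from the args namespace (temp, model, scanner, prager, order, words, min_order, max_order, buckets, jobs, chunksize, sample_count, sample_size). A hypothetical argparse driver that supplies exactly those attributes is sketched below; the option names, types, and defaults are inferred from the function body rather than copied from the original command-line tool.

# Hypothetical driver for main(); flag names and defaults are illustrative only.
import argparse

parser = argparse.ArgumentParser(description='index and tokenize a corpus')
parser.add_argument('model', help='model directory containing the "paths" index file')
parser.add_argument('--temp', help='directory to hold temporary buckets')
parser.add_argument('--scanner', help='path to a pre-built scanner to load with Scanner.from_file')
parser.add_argument('--prager', action='store_true', help='use Prager tokenization instead of n-grams')
parser.add_argument('--order', type=int, default=4, help='order for Prager tokenization')
parser.add_argument('--words', action='store_true', help='use words with Prager tokenization')
parser.add_argument('--min_order', type=int, default=1)
parser.add_argument('--max_order', type=int, default=4)
parser.add_argument('--buckets', type=int, default=64)
parser.add_argument('--jobs', type=int, default=1)
parser.add_argument('--chunksize', type=int, default=50)
parser.add_argument('--sample_count', type=int, default=None)
parser.add_argument('--sample_size', type=int, default=140)

main(parser.parse_args())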
Example 2: open
# Required import: from scanner import Scanner
# Method under test: scanner.Scanner.from_file
print "index path:", index_path
print "bucketlist path:", bucketlist_path
print "buckets path:", buckets_dir
with open(index_path) as f:
reader = csv.reader(f)
items = list(reader)
if sum(map(bool,(args.scanner, args.max_order, args.word))) > 1:
parser.error('can only specify one of --word, --scanner and --max_order')
# Tokenize
print "will tokenize %d files" % len(items)
if args.scanner:
from scanner import Scanner
tokenizer = Scanner.from_file(args.scanner)
print "using provided scanner: ", args.scanner
elif args.word:
tokenizer = str.split
print "using str.split to tokenize"
else:
max_order = args.max_order if args.max_order else MAX_NGRAM_ORDER
tokenizer = NGramTokenizer(1,max_order)
print "using n-gram tokenizer: max_order({0})".format(max_order)
b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs, args.chunksize)
# output the paths to the buckets
with open(bucketlist_path,'w') as f:
for d in b_dirs:
f.write(d+'\n')
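Both examples end by writing one bucket directory per line to the bucketlist file. A later stage can recover that list by reading the file back; the snippet below is an illustrative sketch rather than code from the original tools.

# Read the bucket directories back, one path per line (illustrative only).
with open(bucketlist_path) as f:
    b_dirs = [line.rstrip('\n') for line in f if line.strip()]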