This article collects typical usage examples of the tagger.Tagger class in Python. If you have been wondering what the Python Tagger class does, how to use it, or what real Tagger usage looks like, the hand-picked class code examples below may help.
The following presents 15 code examples of the Tagger class, sorted by popularity by default.
Example 1: main_tag
def main_tag(featureSet, options):
    labelCounter, featCounter = BookKeeper(), BookKeeper()
    labelCounter.readFromFile('{0}.labelNumbers'.format(options.modelName))
    featCounter.readFromFile('{0}.featureNumbers'.format(options.modelName))
    optionsDict = vars(options)
    optionsDict['labelCounter'] = labelCounter
    optionsDict['featCounter'] = featCounter
    optionsDict['modelFile'] = '{0}.model'.format(options.modelName)
    tagger = Tagger(featureSet, optionsDict)
    if options.inFeatFile:
        tagger_func = lambda: tagger.tag_features(options.inFeatFile)
        writer_func = lambda s, c: writeSentence(s, comment=c)
    elif options.input_dir:
        assert isdir(options.input_dir), "--input-dir must be a directory"
        out_dir = "{}_out".format(options.input_dir)
        os.mkdir(out_dir)
        tagger_func = lambda: tagger.tag_dir(options.input_dir)
        writer_func = lambda s, c: writeSentence(
            s, out=open(join(out_dir, '{}.tagged'.format(c)), 'a'))
    else:
        tagger_func = lambda: tagger.tag_corp(sys.stdin)
        writer_func = lambda s, c: writeSentence(s, comment=c)
    for sen, other in tagger_func():
        writer_func(sen, other)
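For context, here is a minimal sketch of how main_tag might be driven from the command line. The argparse flag names, their defaults, and the featureSet placeholder are assumptions for illustration, not part of the original project:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model-name', dest='modelName', required=True)
parser.add_argument('--input-feat-file', dest='inFeatFile', default=None)
parser.add_argument('--input-dir', dest='input_dir', default=None)
options = parser.parse_args()
featureSet = None  # placeholder: the real feature set is built elsewhere (assumption)
main_tag(featureSet, options)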
Example 2: genKeyWords
def genKeyWords(self, question):
    questionToken = self.preProcess(question)
    tagger = Tagger('portugues')
    token = tagger.classify(questionToken)
    keyList = []
    for tok in token:
        if tok[1] == 'N' or re.match('ADJ', tok[1]) or re.match('V', tok[1]):
            keyList.append(tok)
    print keyList
    print len(keyList)
    return keyList
Example 3: mainTag
def mainTag(featureSet, options):
    transModel = None
    if not (options['printWeights'] or options['toCRFsuite']):
        print('loading transition model...', end='', file=sys.stderr, flush=True)
        transModel = TransModel.getModelFromFile(options['transModelFileName'])
        print('done', file=sys.stderr, flush=True)
    tagger = Tagger(featureSet, transModel, options)
    if 'inFeatFile' in options and options['inFeatFile']:
        # Tag a featurized file to outputStream
        for sen, comment in tagger.tagFeatures(options['inFeatFile']):
            writeSentence(sen, options['outputStream'], comment)
    elif 'ioDirs' in options and options['ioDirs']:
        # Tag all files in a directory to fileName.tagged
        for sen, fileName in tagger.tagDir(options['ioDirs'][0]):
            writeSentence(sen, open(join(options['ioDirs'][1], '{0}.tagged'.format(fileName)), 'a', encoding='UTF-8'))
    elif 'toCRFsuite' in options and options['toCRFsuite']:
        # Convert input to CRFsuite format on outputStream for tagging
        tagger.toCRFsuite(options['inputStream'], options['outputStream'])
    elif 'printWeights' in options and options['printWeights']:
        # Print MaxEnt weights to STDOUT
        tagger.printWeights(options['printWeights'], options['outputStream'])
    else:
        # Tag inputStream to outputStream
        for sen, comment in tagger.tagCorp(options['inputStream']):
            writeSentence(sen, options['outputStream'], comment)
Example 4: gen_opt
def gen_opt(self, file_text):
    '''
    Method that generates the new text.
    Each word is classified and concatenated with its type via the '/' character,
    then concatenated with the return value of the synset lookup.
    '''
    tagger = Tagger('portugues')
    tok = word_tokenize(file_text.read().decode('utf-8'))
    clas = tagger.classify(tok)
    p_text = []
    for c in clas:
        if c[1] == 'N' or re.match('ADJ', c[1]) or re.match('V', c[1]) or c[1] == '.':
            gen_set = self.gen_synset(c)
            p_text.append(gen_set)
    optimized_text = ' '.join(p_text)
    return optimized_text
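A hedged usage sketch: gen_opt expects a file-like object whose read() returns UTF-8 bytes (note the .decode('utf-8') call), so under Python 2 a call would look roughly like this. The file name and obj, an instance of the class that defines gen_opt and gen_synset, are assumptions:

file_text = open('input.txt', 'rb')  # hypothetical input file
optimized = obj.gen_opt(file_text)   # obj is assumed to be an instance providing gen_synset
print optimized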
Example 5: __init__
class SearchEngine:
    """
    The class I consider the core of this module. It is the data structure
    that holds the files for the engine's processing.
    """
    def __init__(self):
        self.tagger = Tagger('portugues')

    def insert(self, files):
        """
        Takes an array of files as input and returns a reverse-index list.
        """
        dataset = []
        for f in files:
            paragraph = sent_tokenize(f[1].lower())
            for index, p in enumerate(paragraph):
                words = word_tokenize(p)
                classes = self.tagger.classify(words)
                for c in classes:
                    if re.match('N', c[1]):
                        keysId = [item['_id'] for item in dataset]
                        print 'qtd chaves: ' + str(len(keysId))
                        if c[0] in keysId:
                            ind = keysId.index(c[0])
                            files = dataset[ind]  # note: shadows the 'files' parameter
                            if os.path.basename(f[0]) in files.keys():
                                if not index in dataset[ind][os.path.basename(f[0])]:
                                    dataset[ind][os.path.basename(f[0])].append(index)
                            else:
                                dataset[ind][os.path.basename(f[0])] = [index]
                        else:
                            dataset.append({'_id': c[0], os.path.basename(f[0]): [index]})
        return dataset

    def extract(self, data):
        """
        Simple search algorithm that returns the matching paragraphs as a set.
        """
        print 'vim pelo método extract'
        for d in data:
            paragraphs = []
            try:
                d.pop('_id')
            except KeyError:
                pass  # ok
            for k in d.keys():  # the [1:] was used to skip the first key
                path_name = os.path.abspath('backend') + '/texts/'
                text = open(path_name + k + '.txt').read().decode('utf-8')
                text_sent = sent_tokenize(text)
                for index in d[k]:
                    paragraphs.append(text_sent[index])
        print paragraphs
        return set(paragraphs)
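A minimal sketch of how insert and extract fit together, inferred from the code above. The file names are assumptions; note that insert() keys the index by os.path.basename(f[0]) and extract() re-opens backend/texts/<key>.txt, so the first tuple element should be the file name without its .txt extension:

engine = SearchEngine()
docs = [('doc1', open('backend/texts/doc1.txt').read().decode('utf-8'))]  # hypothetical file
index = engine.insert(docs)        # [{'_id': noun, 'doc1': [sentence indices]}, ...]
sentences = engine.extract(index)  # set of sentences containing the indexed nouns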
Example 6: pos_tag
def pos_tag(tweets):
    """
    Uses the POS tagger interface to tag parts of speech in all the tweet texts,
    storing the result as a dict on the tweet objects.
    """
    print "Tagging..."
    untagged_texts = []  # note: never populated in this snippet
    for tweet in tweets:
        tagger = Tagger()
        textbody = tweet.text
        for phrase in re.split("\.|!|\?", textbody):
            if len(phrase) < 2:
                continue
            phrase = string.replace(phrase, "?", "")
            phrase = string.replace(phrase, "!", "")
            phrase = string.replace(phrase, ".", "")
            tags = tagger.tag_text(phrase)
            if tags != None:
                tweet.tagged_words.append(tags)
    print "Untagged texts: "
    for text in untagged_texts:
        print text
    print "Tagging done."
    return tweets
Example 7: __init__
def __init__(self, p):
    self.save_dir, _ = generate_directory(p.save_to)
    self.p = p
    print_p(self.p)
    self.tagger = Tagger.create_tagger(self.p)
    if 'load_from' in self.p and (self.p.load_from is not None):
        self.load_model(self.p.load_from)
    logger.info('Setting up data...')
    self.streams = setup_data(self.p, use_unlabeled=True, use_labeled=True)
Example 8: main
def main(training_file, training_dir, load_model, skip_train):
    logging.debug('Initializing random seed to 0.')
    random.seed(0)
    np.random.seed(0)
    if load_model:
        tagger = Tagger.load(load_model)
        data = TaggingDataset.load_from_file(training_file, vocab=tagger.vocab, tags=tagger.tags)
    else:
        assert not skip_train, 'Cannot --skip_train without a saved model.'
        logging.debug('Loading dataset from: %s' % training_file)
        data = TaggingDataset.load_from_file(training_file)
        logging.debug('Initializing model.')
        tagger = Tagger(data.vocab, data.tags)
    if not skip_train:
        train_data, dev_data = data.split(0.7)
        batches_train = train_data.prepare_batches(n_seqs_per_batch=10)
        batches_dev = dev_data.prepare_batches(n_seqs_per_batch=100)
        train_mgr = TrainingManager(
            avg_n_losses=len(batches_train),
            training_dir=training_dir,
            tagger_taste_fn=lambda: taste_tagger(tagger, batches_train),
            tagger_dev_eval_fn=lambda: eval_tagger(tagger, batches_dev),
            tagger_save_fn=lambda fname: tagger.save(fname)
        )
        logging.debug('Starting training.')
        while train_mgr.should_continue():
            mb_x, mb_y = random.choice(batches_train)
            mb_loss = tagger.learn(mb_x, mb_y)
            train_mgr.tick(mb_loss=mb_loss)
    evaluate_tagger_and_writeout(tagger)
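Since main supports reloading a saved model via Tagger.load, a previously trained tagger can be evaluated without retraining. A hedged sketch of that invocation; the file and directory names are assumptions:

# Hypothetical call: load a saved model, skip training, evaluate on dev data.
main('dev.tsv', 'runs/exp1', load_model='tagger.model', skip_train=True)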
Example 9: generation
def generation(self):
    self.tokenized = [nltk.word_tokenize(self.sentences[i]) for i in range(len(self.sentences))]
    self.generate_average_position()
    self.types = {}
    tagger = Tagger(False)
    for i in range(len(self.tokenized)):
        typess = tagger.tag_sent(self.tokenized[i])
        for j in range(len(typess)):
            word, val = typess[j]
            if word not in self.types:
                self.types[word] = []
            self.types[word].append(val)
    for element in self.types:
        most_common, num_most_common = Counter(self.types[element]).most_common(1)[0]
        self.types[element] = most_common
    num_sent = 1
    for sent in self.tokenized:
        actual = sent
        num_word = 1
        last = None
        for mot in actual:
            actual = None
            if not self.isWordIn(mot):
                tmp = Etiquette(mot, num_sent, num_word)
                self.nodes.append(tmp)
                actual = tmp
            else:
                actual = self.get_node_with_value(mot)
                actual.add_sid_pid(num_sent, num_word)
            if num_word > 1:
                last.add_next(actual.get_id())
            last = actual
            num_word += 1
        num_sent += 1
Example 10: testExpressionValidatorPlusAndAsterisk
def testExpressionValidatorPlusAndAsterisk(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['a+*b'])
    self.assertEqual(exit_val.exception.code, 4)
Example 11: testExpressionValidatorNegationAndNegation
def testExpressionValidatorNegationAndNegation(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['a!!b'])
    self.assertEqual(exit_val.exception.code, 4)
Example 12: testExpressionValidatorTwoDots
def testExpressionValidatorTwoDots(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['a..b'])
    self.assertEqual(exit_val.exception.code, 4)
Example 13: testReformatMultipleQuantifiers
def testReformatMultipleQuantifiers(self):
    self.assertEqual(Tagger.bonusReformatMutlipleQuantifiers('sub+++'), 'sub+')
Example 14: testQuantifierPriorityExclamationAndDot
def testQuantifierPriorityExclamationAndDot(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('..!!..'), '!')
Example 15: testQuantifierPriorityExclamationAndPlus
def testQuantifierPriorityExclamationAndPlus(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('+++!++'), '!')
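Taken together, Examples 10 through 15 sketch the contract of these static helpers: validateExpressions exits with code 4 when adjacent quantifiers are incompatible, and the bonus* helpers collapse runs of quantifiers, with '!' taking priority. A hedged recap, with expected results inferred solely from the assertions above:

Tagger.bonusReformatMutlipleQuantifiers('sub+++')   # -> 'sub+'
Tagger.bonusMultipleQuantifiersPriority('..!!..')   # -> '!'  ('!' outranks '.')
Tagger.bonusMultipleQuantifiersPriority('+++!++')   # -> '!'  ('!' outranks '+')
Tagger.validateExpressions(['a+*b'])                # raises SystemExit with code 4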