This article collects typical usage examples of the Python method spacy.en.English. If you are unsure what en.English does, how to call it, or what it looks like in real code, the curated examples below should help. You can also read more about the module it belongs to, spacy.en.
Seven code examples of en.English are shown below, sorted by popularity by default. Upvoting the examples you like or find useful helps the system recommend better Python code samples.
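For orientation, here is a minimal sketch of the pattern shared by all of the examples below: construct the pipeline once (loading it is expensive) and reuse the resulting callable. This assumes spaCy 1.x, where spacy.en.English is still available; the sample sentence is only an illustration.

from spacy.en import English

nlp = English()                                        # loads tokenizer, tagger, parser and NER once
doc = nlp(u'The quick brown fox jumps over the lazy dog.')
tokens = [token.lower_ for token in doc]               # lowercased token texts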
Example 1: tokenize_text
# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def tokenize_text(text):
    """
    Gets tokens from a text in English
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    tokens = [token.lower_ for token in nlp(text)]
    return tokens
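Note that tokenize_text relies on a module-level nlp object that is not shown in this snippet; presumably it is created once elsewhere in the module. A hedged sketch of the assumed setup and usage (the sample sentence is illustrative):

nlp = English()   # assumed module-level parser, as in the intro sketch
tokens = tokenize_text(u'Hello World, how are you?')
# roughly: [u'hello', u'world', u',', u'how', u'are', u'you', u'?']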
Example 2: to_nlp_objs
# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def to_nlp_objs(sentences):
    global nlp_parser
    # init once
    if nlp_parser is None:
        nlp_parser = English()

    nlp_objs = []
    for s in sentences:
        nlp_objs.append(nlp_parser(s.decode('unicode-escape'), entity=False))
    return nlp_objs
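to_nlp_objs expects a module-level nlp_parser variable (so the parser is only built on first use) and byte-string input, hence the decode call. A hedged usage sketch under those assumptions, with English imported as above:

nlp_parser = None   # module-level cache; filled in lazily by to_nlp_objs

docs = to_nlp_objs(['The cat sat on the mat.', 'It was happy.'])
first_tokens = [token.orth_ for token in docs[0]]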
Example 3: preprocess_data
# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def preprocess_data(train_infile, test_infile, output_dir, vocab_size, use_mallet_stopwords=False, replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False, only_alpha=False, min_length=1):
    print("Loading SpaCy")
    parser = English()

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings = load_and_process_data(train_infile, vocab_size, parser, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings = load_and_process_data(test_infile, vocab_size, parser, vocab=train_vocab, label_list=label_list, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)

    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))

    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    fh.write_list_to_text(test_dat, os.path.join(output_dir, 'test.dat'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code:
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
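load_and_process_data is not shown on this page; it receives the spaCy parser created above and tokenizes each document. As a rough illustration only, here is a hedged sketch of the kind of tokenization step such a function typically performs. The helper name, flags, and filtering rules are assumptions, not the repo's actual code:

def spacy_tokenize(text, parser, lemmatize=False, only_alpha=False, min_length=1):
    # Parse once, then optionally lemmatize, keep only alphabetic tokens,
    # and drop tokens shorter than min_length. `text` should be unicode.
    doc = parser(text)
    if lemmatize:
        tokens = [token.lemma_ for token in doc]
    else:
        tokens = [token.lower_ for token in doc]
    if only_alpha:
        tokens = [t for t in tokens if t.isalpha()]
    return [t for t in tokens if len(t) >= min_length]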
Example 4: preprocess_data
# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def preprocess_data(train_infile, output_dir, vocab_size, label_type, test_prop, use_mallet_stopwords=False, replace_num=False, group_size=1, only_alpha=False, min_length=3):
    print("Loading SpaCy")
    parser = English()

    with codecs.open(train_infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    n_items = len(lines)
    n_test = int(test_prop * n_items)
    n_train = n_items - n_test
    train_indices = np.random.choice(range(n_items), n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings, label_index = load_and_process_data(train_infile, vocab_size, parser, label_type, train_indices, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings, _ = load_and_process_data(train_infile, vocab_size, parser, label_type, test_indices, vocab=train_vocab, label_list=label_list, label_index=label_index, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)

    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))

    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))

    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
Example 5: main
# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def main():
    """
    Creates a "knowledge resource" from triplets file
    """
    # Get the arguments
    args = docopt("""Parse the Wikipedia dump and create a triplets file, each line is formatted as follows: X\t\Y\tpath

    Usage:
        parse_wikipedia.py <wiki_file> <vocabulary_file> <out_file>

        <wiki_file> = the Wikipedia dump file
        <vocabulary_file> = a file containing the words to include
        <out_file> = the output file
    """)

    nlp = English()

    wiki_file = args['<wiki_file>']
    vocabulary_file = args['<vocabulary_file>']
    out_file = args['<out_file>']

    # Load the phrase pair files
    with codecs.open(vocabulary_file, 'r', 'utf-8') as f_in:
        vocabulary = set([line.strip() for line in f_in])

    with codecs.open(wiki_file, 'r', 'utf-8') as f_in:
        with codecs.open(out_file, 'w', 'utf-8') as f_out:

            # Read the next paragraph
            for paragraph in f_in:

                # Skip empty lines
                paragraph = paragraph.strip()
                if len(paragraph) == 0:
                    continue

                parsed_par = nlp(unicode(paragraph))

                # Parse each sentence separately
                for sent in parsed_par.sents:
                    dependency_paths = parse_sentence(sent, vocabulary)
                    if len(dependency_paths) > 0:
                        for (x, y), paths in dependency_paths.iteritems():
                            for path in paths:
                                print >> f_out, '\t'.join([x, y, path])
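parse_sentence is defined elsewhere in the repo; it extracts dependency paths between vocabulary terms in each parsed sentence. As a rough illustration of the underlying idea, here is a hedged sketch of walking a token's dependency arcs up to the sentence root with spaCy's API. This helper is hypothetical, not the repo's implementation:

def path_to_root(token):
    # Collect (dependency-label, head-text) pairs from a token up to the root;
    # in spaCy the root is its own head, so the loop stops there.
    steps = []
    while token.head is not token:
        steps.append((token.dep_, token.head.orth_))
        token = token.head
    return steps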
Example 6: main
# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3.json')
    parser.add_argument('--weights', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_epoch_070.hdf5')
    parser.add_argument('--sample_size', type=int, default=25)
    parser.add_argument('--caffe', help='path to caffe installation')
    parser.add_argument('--model_def', help='path to model definition prototxt')
    parser.add_argument('--vggmodel', default='VGG_ILSVRC_16_layers.caffemodel', help='path to model parameters')
    args = parser.parse_args()

    print 'Loading Word2vec'
    nlp = English()
    print 'Loaded word2vec features'

    labelencoder = joblib.load('../models/labelencoder.pkl')

    print 'Loading Model'
    model = model_from_json(open(args.model).read())
    print 'Loading Weights'
    model.load_weights(args.weights)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print 'Loaded'

    q = True
    while q:
        path = str(raw_input('Enter path to image : '))
        if path != 'same':
            base_dir = os.path.dirname(path)
            os.system('python extract_features.py --caffe ' + str(args.caffe) + ' --model_def vgg_features.prototxt --gpu --model ' + str(args.vggmodel) + ' --image ' + path)
            print 'Loading VGGfeats'
            vgg_model_path = os.path.join(base_dir + '/vgg_feats.mat')
            features_struct = scipy.io.loadmat(vgg_model_path)
            VGGfeatures = features_struct['feats']
            print "Loaded"

        question = unicode(raw_input("Ask a question: "))
        if question == "quit":
            q = False

        timesteps = len(nlp(question))
        X_q = get_questions_tensor_timeseries([question], nlp, timesteps)
        X_i = np.reshape(VGGfeatures, (1, 4096))
        X = [X_q, X_i]

        y_predict = model.predict_classes(X, verbose=0)
        print labelencoder.inverse_transform(y_predict)
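get_questions_tensor_timeseries comes from the visual-qa repo and is not shown here; roughly, it stacks the spaCy word vector of each question token into a (1, timesteps, embedding_dim) array for the LSTM. A hedged sketch of that idea follows; the helper name, shapes, and the reliance on spaCy 1.x's token.vector attribute are assumptions:

import numpy as np

def question_to_tensor(question, nlp, timesteps):
    # One row of word-vector features per token, zero-padded (and truncated)
    # to `timesteps` rows.
    tokens = nlp(question)
    dim = len(tokens[0].vector)
    tensor = np.zeros((1, timesteps, dim))
    for i, token in enumerate(tokens):
        if i >= timesteps:
            break
        tensor[0, i, :] = token.vector
    return tensor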
Example 7: main
# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def main():
    '''
    Before running this demo, ensure that you have some images from the MS COCO validation set
    saved somewhere, and update the image_dir variable accordingly.
    Also, this demo is designed to run with the models released with the visual-qa repo; if you
    would like to use it with some other model (say, an MLP-based model or a language-only model),
    you will have to make some changes.
    '''
    image_dir = '../../vqa_images/'
    local_images = [f for f in listdir(image_dir) if isfile(join(image_dir, f))]

    parser = argparse.ArgumentParser()
    parser.add_argument('-model', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3.json')
    parser.add_argument('-weights', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_epoch_070.hdf5')
    parser.add_argument('-sample_size', type=int, default=25)
    args = parser.parse_args()

    model = model_from_json(open(args.model).read())
    model.load_weights(args.weights)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print 'Model loaded and compiled'

    images_val = open('../data/preprocessed/images_val2014.txt',
                      'r').read().decode('utf8').splitlines()

    nlp = English()
    print 'Loaded word2vec features'

    labelencoder = joblib.load('../models/labelencoder.pkl')

    vgg_model_path = '../features/coco/vgg_feats.mat'
    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print 'Loaded vgg features'

    image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    image_sample = random.sample(local_images, args.sample_size)

    for image in image_sample:
        p = subprocess.Popen(["display", image_dir + image])
        q = unicode(raw_input("Ask a question about the image: "))
        coco_id = str(int(image[-16:-4]))
        timesteps = len(nlp(q))  # questions sorted in descending order of length
        X_q = get_questions_tensor_timeseries([q], nlp, timesteps)
        X_i = get_images_matrix([coco_id], img_map, VGGfeatures)
        X = [X_q, X_i]

        y_predict = model.predict_classes(X, verbose=0)
        print labelencoder.inverse_transform(y_predict)
        raw_input('Press enter to continue...')
        p.kill()
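get_images_matrix, also from the visual-qa repo, looks up the pre-computed VGG feature column for each COCO id via img_map. A hedged sketch of that lookup under the layout suggested above (4096-dimensional features stored column-wise); the helper name and exact feature layout are assumptions:

import numpy as np

def images_to_matrix(coco_ids, img_map, VGGfeatures):
    # VGGfeatures is assumed to have shape (4096, n_images); img_map maps a
    # COCO id string to its column index, as built in the loop above.
    matrix = np.zeros((len(coco_ids), VGGfeatures.shape[0]))
    for i, coco_id in enumerate(coco_ids):
        matrix[i, :] = VGGfeatures[:, img_map[coco_id]]
    return matrix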