

Python en.English Method Code Examples

This article collects typical usage examples of the spacy.en.English method in Python. If you are wondering what en.English does, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also explore further examples from the spacy.en module that the method belongs to.


The following presents 7 code examples of the en.English method, sorted by popularity by default.
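Note that spacy.en.English is the entry point of the old spaCy 1.x API (it was removed in spaCy 2.0, where spacy.load('en') is used instead), and most of the examples below are Python 2 code. As a minimal sketch of the basic usage pattern the examples share (assuming spaCy 1.x with the English models installed):

from spacy.en import English

nlp = English()                              # loads the English models; slow, so do it once and reuse
doc = nlp(u"The quick brown fox jumps over the lazy dog. It never looks back.")

tokens = [token.lower_ for token in doc]     # lowercased token strings
sentences = list(doc.sents)                  # sentence spans from the dependency parse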

Example 1: tokenize_text

# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def tokenize_text(text):
    """
    Gets tokens from a text in English
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    # `nlp` is a module-level spacy.en.English instance created elsewhere in language.py
    tokens = [token.lower_ for token in nlp(text)]

    return tokens 
Developer: textclf, Project: fancy-cnn, Lines of code: 12, Source: language.py
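The snippet relies on a module-level nlp object that language.py builds elsewhere; a hypothetical setup and call might look like this (Python 2, since the function uses unicode):

# Hypothetical setup; the real language.py creates `nlp` at module level.
from spacy.en import English
nlp = English()

print tokenize_text(u"The Quick Brown Fox")   # [u'the', u'quick', u'brown', u'fox']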

Example 2: to_nlp_objs

# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def to_nlp_objs(sentences):
    global nlp_parser
    # initialize the shared parser once; constructing English() is expensive
    if nlp_parser is None:
        nlp_parser = English()

    nlp_objs = []
    for s in sentences:
        nlp_objs.append(nlp_parser(s.decode('unicode-escape'), entity=False))
    return nlp_objs 
Developer: CatalystCode, Project: corpus-to-graph-ml, Lines of code: 12, Source: features_generation_tools.py
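For this snippet to work, nlp_parser must exist as a module-level global initialized to None, and the input sentences are Python 2 byte strings (they are decoded with 'unicode-escape'). A rough usage sketch under those assumptions:

# Assumed module-level state for the snippet above.
nlp_parser = None

docs = to_nlp_objs(["The cat sat on the mat.", "It purred quietly."])
print len(docs), [token.text for token in docs[0]]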

Example 3: preprocess_data

# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def preprocess_data(train_infile, test_infile, output_dir, vocab_size, use_mallet_stopwords=False, replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False, only_alpha=False, min_length=1):

    print("Loading SpaCy")
    parser = English()
    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings = load_and_process_data(train_infile, vocab_size, parser, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings = load_and_process_data(test_infile, vocab_size, parser, vocab=train_vocab, label_list=label_list, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    fh.write_list_to_text(test_dat, os.path.join(output_dir, 'test.dat'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code:
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt')) 
Developer: dallascard, Project: neural_topic_models, Lines of code: 36, Source: preprocess_data.py
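A hypothetical invocation of this function might look like the following; the file paths and vocabulary size are made up for illustration, and fh and load_and_process_data are helpers from the neural_topic_models repo that are not shown above:

# Hypothetical call; the paths are placeholders, not files from the repo.
preprocess_data(
    train_infile='data/train.jsonlist',
    test_infile='data/test.jsonlist',
    output_dir='processed',
    vocab_size=2000,
    lemmatize=True,
)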

Example 4: preprocess_data

# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def preprocess_data(train_infile, output_dir, vocab_size, label_type, test_prop, use_mallet_stopwords=False, replace_num=False, group_size=1, only_alpha=False, min_length=3):

    print("Loading SpaCy")
    parser = English()

    with codecs.open(train_infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    n_items = len(lines)
    n_test = int(test_prop * n_items)
    n_train = n_items - n_test
    train_indices = np.random.choice(range(n_items), n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings, label_index = load_and_process_data(train_infile, vocab_size, parser, label_type, train_indices, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings, _ = load_and_process_data(train_infile, vocab_size, parser, label_type, test_indices, vocab=train_vocab, label_list=label_list, label_index=label_index, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))

    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt')) 
Developer: dallascard, Project: neural_topic_models, Lines of code: 39, Source: preprocess_nips.py

Example 5: main

# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def main():
    """
    Creates a "knowledge resource" from triplets file
    """

    # Get the arguments
    args = docopt("""Parse the Wikipedia dump and create a triplets file, each line is formatted as follows: X\t\Y\tpath

    Usage:
        parse_wikipedia.py <wiki_file> <vocabulary_file> <out_file>

        <wiki_file> = the Wikipedia dump file
        <vocabulary_file> = a file containing the words to include
        <out_file> = the output file
    """)

    nlp = English()

    wiki_file = args['<wiki_file>']
    vocabulary_file = args['<vocabulary_file>']
    out_file = args['<out_file>']

    # Load the phrase pair files
    with codecs.open(vocabulary_file, 'r', 'utf-8') as f_in:
        vocabulary = set([line.strip() for line in f_in])

    with codecs.open(wiki_file, 'r', 'utf-8') as f_in:
        with codecs.open(out_file, 'w', 'utf-8') as f_out:

            # Read the next paragraph
            for paragraph in f_in:

                # Skip empty lines
                paragraph = paragraph.strip()
                if len(paragraph) == 0:
                    continue

                parsed_par = nlp(unicode(paragraph))

                # Parse each sentence separately
                for sent in parsed_par.sents:
                    dependency_paths = parse_sentence(sent, vocabulary)
                    if len(dependency_paths) > 0:
                        for (x, y), paths in dependency_paths.iteritems():
                            for path in paths:
                                print >> f_out, '\t'.join([x, y, path]) 
Developer: morningmoni, Project: TaxoRL, Lines of code: 48, Source: parse_wikipedia.py
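parse_sentence (not shown here) extracts dependency paths between vocabulary terms from each parsed sentence. As a rough sketch of the spaCy 1.x attributes such code typically walks, assuming nlp = English() as in the example above:

# Inspect the dependency parse that parse_sentence (not shown) operates on.
doc = nlp(u"The tail of the cat twitched nervously.")
for sent in doc.sents:
    for token in sent:
        # token.dep_ is the dependency label, token.head is the governing token
        print token.text, token.dep_, token.head.text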

Example 6: main

# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def main():

   parser = argparse.ArgumentParser()
   parser.add_argument('--model', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3.json')
   parser.add_argument('--weights', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_epoch_070.hdf5')
   parser.add_argument('--sample_size', type=int, default=25)
   parser.add_argument('--caffe', help='path to caffe installation')
   parser.add_argument('--model_def', help='path to model definition prototxt')
   parser.add_argument('--vggmodel', default='VGG_ILSVRC_16_layers.caffemodel', help='path to model parameters')
   args = parser.parse_args()
   print 'Loading Word2vec'
   nlp = English()
   print 'Loaded word2vec features'
   labelencoder = joblib.load('../models/labelencoder.pkl')
   print 'Loading Model'
   model = model_from_json(open(args.model).read())
   print 'Loading Weights'
   model.load_weights(args.weights)
   model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
   print 'Loaded'
   q = True

   while q:

       path = str(raw_input('Enter path to image : '))
       if path != 'same':
           base_dir = os.path.dirname(path)
           os.system('python extract_features.py --caffe ' + str(args.caffe) + ' --model_def vgg_features.prototxt --gpu --model ' + str(args.vggmodel) + ' --image ' + path )
       print 'Loading VGGfeats'
       vgg_model_path = os.path.join(base_dir + '/vgg_feats.mat')
       features_struct = scipy.io.loadmat(vgg_model_path)
       VGGfeatures = features_struct['feats']
       print "Loaded"

       question = unicode(raw_input("Ask a question: "))
       if question == "quit":
           q = False
       timesteps = len(nlp(question))
       X_q = get_questions_tensor_timeseries([question], nlp, timesteps)
       X_i = np.reshape(VGGfeatures, (1, 4096))

       X = [X_q, X_i]

       y_predict = model.predict_classes(X, verbose=0)
       print labelencoder.inverse_transform(y_predict) 
Developer: avisingh599, Project: visual-qa, Lines of code: 47, Source: own_image.py
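get_questions_tensor_timeseries comes from the visual-qa repo and is not shown here; very roughly, it turns the question's per-token spaCy word vectors into a (batch, timesteps, vector_dim) tensor for the LSTM. A hypothetical sketch of that idea (not the repo's actual implementation):

import numpy as np

def questions_to_tensor(questions, nlp, timesteps):
    # Hypothetical re-creation of the idea behind get_questions_tensor_timeseries:
    # stack each token's word vector into a fixed-length timeseries per question.
    dim = nlp(questions[0])[0].vector.shape[0]
    X = np.zeros((len(questions), timesteps, dim), dtype='float32')
    for i, question in enumerate(questions):
        tokens = nlp(question)
        for j, token in enumerate(tokens[:timesteps]):
            X[i, j, :] = token.vector
    return X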

Example 7: main

# Required import: from spacy import en [as alias]
# Or: from spacy.en import English [as alias]
def main():
	'''
	Before running this demo, ensure that you have some images from the MS COCO validation set
	saved somewhere, and update the image_dir variable accordingly.
	Also, this demo is designed to run with the models released with the visual-qa repo; if you
	would like to use it with some other model (say an MLP-based model or a language-only model)
	you will have to make some changes.
	'''
	image_dir = '../../vqa_images/'
	local_images = [ f for f in listdir(image_dir) if isfile(join(image_dir,f)) ]	
	
	parser = argparse.ArgumentParser()
	parser.add_argument('-model', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3.json')
	parser.add_argument('-weights', type=str, default='../models/lstm_1_num_hidden_units_lstm_512_num_hidden_units_mlp_1024_num_hidden_layers_mlp_3_epoch_070.hdf5')
	parser.add_argument('-sample_size', type=int, default=25)
	args = parser.parse_args()
	
	model = model_from_json(open(args.model).read())
	model.load_weights(args.weights)
	model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
	print 'Model loaded and compiled'
	images_val = open('../data/preprocessed/images_val2014.txt', 
						'r').read().decode('utf8').splitlines()

	nlp = English()
	print 'Loaded word2vec features'
	labelencoder = joblib.load('../models/labelencoder.pkl')

	vgg_model_path = '../features/coco/vgg_feats.mat'
	features_struct = scipy.io.loadmat(vgg_model_path)
	VGGfeatures = features_struct['feats']
	print 'Loaded vgg features'
	image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
	img_map = {}
	for ids in image_ids:
		id_split = ids.split()
		img_map[id_split[0]] = int(id_split[1])

	image_sample = random.sample(local_images, args.sample_size)

	for image in image_sample:
		p = subprocess.Popen(["display", image_dir + image])
		q = unicode(raw_input("Ask a question about the image:"))	
		coco_id = str(int(image[-16:-4]))
		timesteps = len(nlp(q)) #questions sorted in descending order of length
		X_q = get_questions_tensor_timeseries([q], nlp, timesteps)
		X_i = get_images_matrix([coco_id], img_map, VGGfeatures)
		X = [X_q, X_i]
		y_predict = model.predict_classes(X, verbose=0)
		print labelencoder.inverse_transform(y_predict)
		raw_input('Press enter to continue...')
		p.kill() 
Developer: avisingh599, Project: visual-qa, Lines of code: 54, Source: demo_batch.py


Note: The spacy.en.English method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their authors; copyright of the source code remains with the original authors, and distribution and use should follow the corresponding project licenses. Please do not reproduce without permission.