This article collects and summarizes typical usage examples of nltk.data in Python. If you are wondering what nltk.data is for, how to use it, or what it looks like in practice, the hand-picked code examples below may help. You can also explore further usage examples from the module the method belongs to, nltk.
The following 15 code examples of nltk.data are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps recommend better Python code examples.
Example 1: sql_query
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
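A minimal usage sketch of the function above. The database path and table name follow NLTK's chat80 demo ('corpora/city_database/city.db' with a city_table table) and are assumptions; substitute whatever SQLite file you actually have installed under an nltk.data path.

import nltk  # sql_query above relies on nltk.data.find

rows = sql_query('corpora/city_database/city.db',
                 "SELECT City, Country FROM city_table")
for city, country in rows:
    print(city, country)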
Example 2: val_dump
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.
    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
        The suffix '.db' will be automatically appended.
    :type db: str
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')
    db_out.update(valuation)
    db_out.close()
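A short sketch of reading back what val_dump wrote. The 'world' file name is hypothetical, and the relation bundles (rels) would normally be built by helpers such as process_bundle from the chat80 metadata, which are not shown here.

import shelve

db_in = shelve.open('world')  # shelve handles the '.db' suffix mentioned in the docstring
for symbol, value in db_in.items():
    print(symbol, value)
db_in.close()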
Example 3: add_full_stops_to_the_end
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def add_full_stops_to_the_end(infile, outfile):
    # Clean the data of short titles and add full stops so that NLTK sentence splitting works.
    output_format = '{}.\n'.format
    with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line[0] == ' ':
                pass
            # Ignore headlines with three or fewer words.
            elif len(line.split()) <= 3:
                pass
            elif line.endswith(('.\n', '!\n', '?\n', '\'\n', '"\n')):
                print >> fout, line.decode('utf-8'),
            else:
                print >> fout, output_format(line.strip()).decode('utf-8'),

############################################
# Convert all except the first word and    #
# quotes to lower case                     #
############################################
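A minimal usage sketch; the file names are hypothetical. Note that the function above uses Python 2 print syntax and str.decode, so it needs to run under Python 2.

add_full_stops_to_the_end('headlines_raw.txt', 'headlines_with_stops.txt')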
Example 4: clean_raw_text
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def clean_raw_text(text, file_name=''):
    '''
    Cleans the raw text input and places the cleaned text in the 'samples' folder,
    one sentence per line (as required by semaphore).
    '''
    import re
    import nltk, nltk.data
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = text
    clean_file = file_name if file_name else 'clean_text.txt'
    text = re.sub(r'-+(\n)\s*', '', raw_text)  # rejoin words hyphenated across line breaks
    text = re.sub(r'(\n)+', '', text)          # drop the remaining line breaks
    text = '\n'.join([' '.join(nltk.word_tokenize(sent)) for sent in sent_detector.tokenize(text.strip())])
    open(clean_file, 'w').write(text)
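A minimal usage sketch; the file names are hypothetical, and the punkt model must be available locally (for example via nltk.download('punkt')).

raw = open('article.txt').read()
clean_raw_text(raw, file_name='article_clean.txt')
print(open('article_clean.txt').read())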
Example 5: semaphore
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def semaphore(text='', files='', semaphore=release):
    '''
    This function takes a string or a list of file path names and outputs a Python
    data structure containing semantic frames for each sentence (the text can be
    completely raw).
    '''
    os.chdir(semaphore)
    if text:
        sample = '../samples/cleaned.txt'
    if files:
        text = text + ' '.join([open(f, 'r').read() for f in files])
        # Name the newly cleaned file after the first file in the list, plus "_clean":
        sample = '../samples/' + files[0].split('/')[-1][:-4] + '_clean.txt'
    if text:
        clean_raw_text(text, file_name=sample)
    else:
        sample = '../samples/sample.txt'
    run_semaphore(release=semaphore, sample=sample)
    return import_semaphore()
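A minimal usage sketch. The module-level names release, run_semaphore and import_semaphore are assumed to point at a local SEMAFOR installation and its helper functions; they are not shown in this example.

frames = semaphore(text="The quick brown fox jumped over the lazy dog.")
print(frames)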
Example 6: mysemaphore
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def mysemaphore(text, path, semaphore=release):
    import shutil
    '''
    This function takes a string or a list of file path names and outputs a Python
    data structure containing semantic frames for each sentence (the text can be
    completely raw).
    '''
    os.chdir(semaphore)
    sample = '../samples/cleaned.txt'
    open(sample, 'w').write(text)
    run_semaphore(release=semaphore, sample=sample)
    shutil.copy(semaphore + '/../samples/output.txt', path)
    return import_semaphore()
Example 7: load_data_and_labels
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    # x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
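A minimal usage sketch; it assumes the rt-polarity files exist under ./data/ and that clean_str (not shown) is defined in the same module.

x_text, y = load_data_and_labels()
print(len(x_text), y.shape)  # number of sentences and the (N, 2) one-hot label matrix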
Example 8: load_data_for_books
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def load_data_for_books(path):
    text = ''.join(open(path).readlines()).decode('utf8')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    # book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    # book = list(open(path, "r").readlines())
    book = [s.strip() for s in book]
    book = [clean_str(sent) for sent in book]
    book = [s.split(" ") for s in book]
    x_text = book
    y = np.vstack([np.zeros(len(book)), np.ones(len(book))]).T
    sentences, labels = x_text, y
    sentences_padded = pad_sentences(sentences)
    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
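A minimal usage sketch; the book path is hypothetical, and the helpers pad_sentences, build_vocab and build_input_data (not shown) must be defined in the same module.

x, y, vocabulary, vocabulary_inv, sentencesT = load_data_for_books('./data/moby_dick.txt')
print(x.shape, y.shape, len(vocabulary))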
Example 9: batch_iter
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
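A minimal usage sketch with toy data: the generator yields shuffled mini-batches of (feature, label) rows for each epoch.

import numpy as np

toy_data = list(zip(np.arange(10), np.arange(10, 20)))
for batch in batch_iter(toy_data, batch_size=4, num_epochs=2):
    print(batch.shape)  # at most (4, 2) rows per batch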
Example 10: __init__
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def __init__(self, language="en", punkt_data_path=None):
    self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
    self.log = log.get_global_console_logger()
    try:
        import nltk.data
    except ImportError:
        self.log.error(
            "Cannot import NLTK data for the sentence splitter. Please "
            "check if the 'punkt' NLTK-package is installed correctly.")
    try:
        if not punkt_data_path:
            punkt_data_path = self.lang2datapath[language]
        self.sent_detector = nltk.data.load(punkt_data_path)
    except KeyError:
        self.log.error(
            "No sentence splitter data for language {}.".format(language))
    except:
        self.log.error(
            "Could not load sentence splitter data: {}".format(
                self.lang2datapath[language]))
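A minimal usage sketch. The enclosing class is not shown above, so the name SentenceSplitter is a placeholder; the sent_detector attribute set in __init__ is a standard NLTK Punkt tokenizer, so its tokenize method can be used directly.

splitter = SentenceSplitter(language="en")  # hypothetical class wrapping the __init__ above
print(splitter.sent_detector.tokenize("First sentence. Second sentence."))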
Example 11: sql_query
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise
Example 12: data
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def data(self):
    for name in self.names:
        f = nltk.data.find(name)
        with f.open() as fp:
            file_data = fp.read().decode('utf8')
        yield f, file_data
Example 13: test_correct_values
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def test_correct_values(self):
    # Check that corpus views produce the correct sequence of values.
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(list(v), file_data.split())
        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(list(v), self.linetok.tokenize(file_data))
Example 14: test_correct_length
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def test_correct_length(self):
    # Check that the corpus views report the correct lengths:
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(len(v), len(file_data.split()))
        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
Example 15: fcfg_demo
# Required import: import nltk [as alias]
# Or: from nltk import data [as alias]
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()