This article collects typical usage examples of nltk.data in Python. If you have been wondering how exactly to use nltk.data, or what calling code looks like in practice, the curated examples below may help. You can also explore further usage examples from the nltk module it belongs to.

The following 15 code examples of nltk.data are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: sql_query
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.

    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
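A hedged usage sketch (not from the original source): it assumes the NLTK city-database package has been downloaded, e.g. via nltk.download('city_database'), so that nltk.data.find can locate it.

# Assumes the 'city_database' NLTK data package is installed and uncompressed.
rows = sql_query('corpora/city_database/city.db',
                 "SELECT City, Country FROM city_table WHERE Country = 'china'")
for city, country in rows:
    print(city, country)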
Example 2: val_dump
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
        The suffix '.db' will be automatically appended.
    :type db: string
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')
    db_out.update(valuation)
    db_out.close()
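A short read-back sketch (not part of the original snippet): once val_dump() has written the shelve file, the stored relation symbols can be inspected directly; in nltk.sem.chat80, where these helpers come from, val_load() is the companion that rebuilds a Valuation from the dump.

import shelve

# 'chat80_val' is a placeholder name; val_dump(rels, 'chat80_val') must have
# been called first so the shelve file exists on disk.
db_in = shelve.open('chat80_val')
print(list(db_in.keys())[:5])   # relation symbols stored by val_dump()
db_in.close()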
Example 3: add_full_stops_to_the_end
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def add_full_stops_to_the_end(infile, outfile):
    # clean the data of short titles and add full stops so that NLTK sentence
    # splitting works (Python 2 code: uses `print >>` and str.decode)
    output_format = '{}.\n'.format
    with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line[0] == ' ':
                pass
            # ignore headlines with three or fewer words
            elif len(line.split()) <= 3:
                pass
            elif line.endswith(('.\n', '!\n', '?\n', '\'\n', '"\n')):
                print >> fout, line.decode('utf-8'),
            else:
                print >> fout, output_format(line.strip()).decode('utf-8'),
############################################
# Convert All except first word and quotes
# to lower case #
############################################
Example 4: clean_raw_text
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def clean_raw_text(text, file_name=''):
    '''
    Cleans the raw input text and writes it out one tokenized sentence per
    line (the input format required by Semaphore), to `file_name` or, if no
    name is given, to 'clean_text.txt' in the current directory.
    '''
    import re
    import nltk, nltk.data

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = text
    clean_file = file_name if file_name else 'clean_text.txt'
    # undo hyphenation across line breaks, then drop the remaining newlines
    text = re.sub(r'-+(\n)\s*', '', raw_text)
    text = re.sub(r'(\n)+', '', text)
    # one whitespace-tokenized sentence per line
    text = '\n'.join([' '.join(nltk.word_tokenize(sent))
                      for sent in sent_detector.tokenize(text.strip())])
    open(clean_file, 'w').write(text)
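A hedged usage sketch: it assumes the NLTK 'punkt' model is available (nltk.download('punkt')); with no file name given, the cleaned text lands in clean_text.txt in the current directory.

# assumes nltk.download('punkt') has been run
raw = ("Semantic role labelling assigns frames to sentences. "
       "This text will be re-tokenized, one sentence per line.")
clean_raw_text(raw)
print(open('clean_text.txt').read())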
Example 5: semaphore
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def semaphore(text='', files='', semaphore=release):
    '''
    Takes a string and/or a list of file path names and returns a Python data
    structure containing the semantic frames for each sentence (the input
    text can be completely raw).
    '''
    os.chdir(semaphore)
    if text:
        sample = '../samples/cleaned.txt'
    if files:
        text = text + ' '.join([open(f, 'r').read() for f in files])
        # name the newly cleaned file after the first file in the list + "_clean"
        sample = '../samples/' + files[0].split('/')[-1][:-4] + '_clean.txt'
    if text:
        clean_raw_text(text, file_name=sample)
    else:
        sample = '../samples/sample.txt'
    run_semaphore(release=semaphore, sample=sample)
    return import_semaphore()
Example 6: mysemaphore
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def mysemaphore(text, path, semaphore=release):
    '''
    Takes a raw text string, runs Semaphore on it, copies the frame output
    to `path`, and returns the semantic frames as a Python data structure.
    '''
    import shutil
    os.chdir(semaphore)
    sample = '../samples/cleaned.txt'
    open(sample, 'w').write(text)
    run_semaphore(release=semaphore, sample=sample)
    shutil.copy(semaphore + '/../samples/output.txt', path)
    return import_semaphore()
Example 7: load_data_and_labels
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
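A hypothetical call (the rt-polaritydata files under ./data/ and the clean_str() helper are assumed to exist, as in the original project):

x_text, y = load_data_and_labels()   # assumes ./data/rt-polaritydata/* and clean_str()
print(len(x_text), y.shape)          # number of sentences and the (N, 2) label matrix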
Example 8: load_data_for_books
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def load_data_for_books(path):
    text = ''.join(open(path).readlines()).decode('utf8')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    #book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    #book = list(open(path, "r").readlines())
    book = [s.strip() for s in book]
    book = [clean_str(sent) for sent in book]
    book = [s.split(" ") for s in book]
    x_text = book
    y = np.vstack([np.zeros(len(book)), np.ones(len(book))]).T
    sentences, labels = x_text, y
    sentences_padded = pad_sentences(sentences)
    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
Example 9: batch_iter
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
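A minimal sketch of consuming the generator with toy data (numpy imported as np is assumed, as in the snippet above):

toy_data = list(zip(range(10), range(10)))   # 10 toy (x, y) pairs
for batch in batch_iter(toy_data, batch_size=4, num_epochs=2, shuffle=True):
    print(batch.shape)   # (4, 2) batches, with a smaller final batch each epoch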
Example 10: __init__
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def __init__(self, language="en", punkt_data_path=None):
    self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
    self.log = log.get_global_console_logger()
    try:
        import nltk.data
    except ImportError:
        self.log.error(
            "Cannot import NLTK data for the sentence splitter. Please "
            "check if the 'punkt' NLTK-package is installed correctly.")
    try:
        if not punkt_data_path:
            punkt_data_path = self.lang2datapath[language]
        self.sent_detector = nltk.data.load(punkt_data_path)
    except KeyError:
        self.log.error(
            "No sentence splitter data for language {}.".format(language))
    except:
        self.log.error(
            "Could not load sentence splitter data: {}".format(
                self.lang2datapath[language]))
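The class that this __init__ belongs to is not shown above, so the standalone sketch below only demonstrates the same punkt model being loaded and used to split text into sentences:

import nltk.data

# assumes nltk.download('punkt') has been run
sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")
print(sent_detector.tokenize("Dr. Smith arrived. He sat down next to Ms. Jones."))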
Example 11: sql_query
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.

    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise
Example 12: data
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def data(self):
    for name in self.names:
        f = nltk.data.find(name)
        with f.open() as fp:
            file_data = fp.read().decode('utf8')
        yield f, file_data
Example 13: test_correct_values
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def test_correct_values(self):
    # Check that corpus views produce the correct sequence of values.
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(list(v), file_data.split())
        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(list(v), self.linetok.tokenize(file_data))
Example 14: test_correct_length
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def test_correct_length(self):
    # Check that the corpus views report the correct lengths:
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(len(v), len(file_data.split()))
        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
Example 15: fcfg_demo
# Required import: import nltk [as alias]
# Alternatively: from nltk import data [as alias]
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
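A hedged follow-up sketch: the same feature grammar can also be used for parsing via load_parser, as in chapter 9 of the NLTK book; the example sentence is one that feat0.fcfg covers.

from nltk import load_parser

# builds a feature-based chart parser from the grammar loaded above
cp = load_parser('grammars/book_grammars/feat0.fcfg')
for tree in cp.parse('Kim likes children'.split()):
    print(tree)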