

Python nltk.sent_tokenize Method Code Examples

This article collects typical usage examples of the Python nltk.sent_tokenize method. If you are wondering how exactly nltk.sent_tokenize is used, what it is for, or what real calls to it look like, the hand-picked code examples below should help. You can also explore further usage examples of the nltk module that this method belongs to.


The sections below present 15 code examples of the nltk.sent_tokenize method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
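Before diving into the project code, here is a minimal usage sketch (the sample text is made up, and it assumes the Punkt sentence tokenizer models have not been downloaded yet):

import nltk

nltk.download('punkt')  # one-time download of the Punkt sentence tokenizer models

text = "NLTK splits text into sentences. Each sentence becomes one string. That is all."
print(nltk.sent_tokenize(text))
# ['NLTK splits text into sentences.', 'Each sentence becomes one string.', 'That is all.']

# An optional second argument selects the language model, e.g.:
# nltk.sent_tokenize(text, language='german')

The examples that follow show how open-source projects use the same call inside larger pipelines.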

Example 1: prepareSents

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def prepareSents(wrds):
    valid_sents=[]
    text=''.join(wrd[0] for wrd in wrds)
    sent_list=[[(word,0,'None') for word in sent] for sent in sent_tokenize(text)]
    text=[word for word in wrds if word[0]!=' ']
    sent_list=[[word for word in concat_words(strip_chars(sent)) if word[0]!=' '] for sent in sent_list]
    idx=0
    s_idx=0
    while idx < len(text) and s_idx<len(sent_list):
        if not match_words(sent_list[s_idx],text[idx:idx+len(sent_list[s_idx])]):
            print "NLTK:"+ str(sent_list[s_idx])
            print 'MINE:' + str(text[idx:idx+len(sent_list[s_idx])])
        else:
            valid_sents+=[text[idx:idx+len(sent_list[s_idx])]]
        idx=idx+len(sent_list[s_idx])
        s_idx+=1
    return valid_sents 
Author: abhyudaynj, Project: LSTM-CRF-models, Lines: 19, Source: extract_data.py

Example 2: get_story_question_answer_triples

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore").decode("ascii")  # drop non-ASCII chars; decode back to str for Python 3
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples 
Author: sujitpal, Project: dl-models-for-qa, Lines: 20, Source: kaggle.py

Example 3: parse_sents

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def parse_sents(self, sents):
        """
        Parse multiple sentences

        If "sents" is a string, it will be segmented into sentences using NLTK.
        Otherwise, each element of "sents" will be treated as a sentence.

        sents (str or Iterable[str] or Iterable[List[str]]): sentences to parse

        Returns: Iter[nltk.Tree]
        """
        if isinstance(sents, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No tokenizer available for this language. "
                    "Please split into individual sentences and tokens "
                    "before calling the parser."
                    )
            sents = nltk.sent_tokenize(sents, self._tokenizer_lang)

        for parse_raw, tags_raw, sentence in self._batched_parsed_raw(self._nltk_process_sents(sents)):
            yield self._make_nltk_tree(sentence, tags_raw, *parse_raw) 
Author: nikitakit, Project: self-attentive-parser, Lines: 24, Source: nltk_plugin.py

Example 4: preprocess

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which
    is a list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e))) 
Author: DistrictDataLabs, Project: partisan-discourse, Lines: 18, Source: nlp.py

Example 5: extract_nnp_phrases

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list 
Author: LexPredict, Project: lexpredict-contraxsuite, Lines: 27, Source: custom.py

Example 6: clean_thread_conversations

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def clean_thread_conversations(sub_str):
    conversations = []
    for mon in ['07', '08', '09', '10', '11', '12']:
        with open('datasets/raw_reddit/reddit_{}_{}_18threads.json'.format(sub_str, mon)) as f:
            data = json.load(f)

        for thread in data:
            new_convo = {}
            new_convo['lines'] = []
            speaker = 0
            for msg in thread:
                text = clean_post(msg['text'])
                if len(text) > 1:
                    sentences = nltk.sent_tokenize(text)
                    for sent in sentences:
                        sent_dict = {}
                        sent_dict['character'] = speaker
                        sent_dict['text'] = sent
                        new_convo['lines'].append(sent_dict)
                    speaker = 1 - speaker
            if len(new_convo['lines']) > 1:
                conversations.append(new_convo)
    return conversations 
Author: natashamjaques, Project: neural_chat, Lines: 25, Source: reddit_utils.py

Example 7: sentence_splitting

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def sentence_splitting (texts, slen = 1):
	if len(texts) <= 0:
		return []
	
	# splitting
	sentences = []
	text_sents = sent_tokenize(texts)
	if (text_sents != [''] and len(text_sents) >  0):
		for sent in text_sents:
			sent = sent.strip().split('\r') # split strings that contains "\r"
			for sen in sent:
				se = sen.split('. ')
				for s in se: 
					if (NLP_word.words_counting(s) >= slen):
						sentences.append(s)

	return sentences


# splitting text into sentences using NLTK tokenization 
Author: Tony-Hao, Project: Valx, Lines: 22, Source: sentence.py

Example 8: flesch_kincaid_reading_ease

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def flesch_kincaid_reading_ease(text, token_count):
    """
    Takes a text and returns its FK Reading Ease
    :param text: A string text
    :param token_count: the number of tokens in the text
    :return: FK Reading Ease
    """

    # Partly extracted from textstat 0.3.1, which is only available for Python 2 (https://github.com/shivam5992/textstat)

    def avg_syllables_per_word(text, token_count):
        syllable = syllable_count(text)
        if token_count > 0:
            return float(syllable) / float(token_count)
        else:
            return 0

    if len(nltk.sent_tokenize(text)) <= 0 or token_count <= 0:
        return 0

    ASL = float(token_count / len(nltk.sent_tokenize(text)))  # avg sentence length
    ASW = avg_syllables_per_word(text, token_count)
    FKRA = 206.835 - float(1.015 * ASL) - float(84.6 * ASW)
    return FKRA 
Author: UKPLab, Project: coling2018_fake-news-challenge, Lines: 26, Source: readability_indices.py
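As a quick, illustrative sanity check of the formula above (the numbers are made up, not taken from the source project):

# Hypothetical averages: 10 tokens per sentence, 1.3 syllables per word.
ASL = 10.0
ASW = 1.3
FKRA = 206.835 - 1.015 * ASL - 84.6 * ASW
print(FKRA)  # roughly 86.7, i.e. fairly easy-to-read text on the FK Reading Ease scale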

Example 9: parse_gender

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        ) 
Author: foxbook, Project: atap, Lines: 19, Source: gender.py

Example 10: __init__

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def __init__(self, text, wordlistfolder):
        self.text = text
        self.tokens = nltk.word_tokenize(text)
        self.sentenses = nltk.sent_tokenize(text)
        self.tags = nltk.pos_tag(self.tokens)

        self.featspace = []

        self.psykfeatspace(self.featspace, wordlistfolder)
        self.bigrams(self.featspace)
        self.number_count(self.featspace)
        self.punc_count(self.featspace)
        self.big_word_count(self.featspace)
        self.words_per_sentence(self.featspace)
        self.sentence_count(self.featspace)
        self.countPOS(self.featspace, 'CC')
        self.countPOS(self.featspace, 'NP')
        self.countPOS(self.featspace, 'NNP')
        self.words(self.featspace)
        self.stem(self.featspace)

    # Counts a specific POS tags 
Author: chrismattmann, Project: tika-similarity, Lines: 24, Source: psykey.py

Example 11: _from_text

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def _from_text(self, text, unit, tokenizer, language):

        if tokenizer is not None:
            if isinstance(tokenizer, str):
                tokens = re.findall(tokenizer, text)
            else:
                tokens = tokenizer.tokenize(text)
        else:
            import nltk

            @requires_nltk_corpus
            def tokenize_text(text):
                if unit == 'word':
                    return nltk.word_tokenize(text, language)
                elif unit.startswith('sent'):
                    return nltk.sent_tokenize(text, language)
                else:
                    raise ValueError(
                        "unit must be either 'word' or 'sentence'")

            tokens = tokenize_text(text)

        for i, t in enumerate(tokens):
            self._elements.append(TextStim(text=t, onset=None, duration=None,
                                  order=i)) 
Author: tyarkoni, Project: pliers, Lines: 27, Source: text.py

Example 12: parse_sgm

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def parse_sgm(self, sgm_path):
        with open(sgm_path, 'r') as f:
            soup = BeautifulSoup(f.read(), features='html.parser')
            self.sgm_text = soup.text

            doc_type = soup.doc.doctype.text.strip()

            def remove_tags(selector):
                tags = soup.findAll(selector)
                for tag in tags:
                    tag.extract()

            if doc_type == 'WEB TEXT':
                remove_tags('poster')
                remove_tags('postdate')
                remove_tags('subject')
            elif doc_type in ['CONVERSATION', 'STORY']:
                remove_tags('speaker')

            sents = []
            converted_text = soup.text

            for sent in nltk.sent_tokenize(converted_text):
                sents.extend(sent.split('\n\n'))
            sents = list(filter(lambda x: len(x) > 5, sents))
            sents = sents[1:]
            sents_with_pos = []
            last_pos = 0
            for sent in sents:
                pos = self.sgm_text.find(sent, last_pos)
                last_pos = pos
                sents_with_pos.append({
                    'text': sent,
                    'position': [pos, pos + len(sent)]
                })

            return sents_with_pos 
Author: nlpcl-lab, Project: ace2005-preprocessing, Lines: 39, Source: parser.py

Example 13: tokenize_sentences

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def tokenize_sentences(targets):
    while True:
        text = (yield)  # (yield) gets an item from an upstream step
        sentences = nltk.sent_tokenize(text)
        for sentence in sentences:
            for target in targets:
                target.send(sentence)  # send() sends data downstream 
Author: PacktPublishing, Project: Hands-on-NLP-with-NLTK-and-scikit-learn-, Lines: 9, Source: nlp-6.1-nlp-pipeline.py
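Because this step is written as a coroutine, it must be primed with next() before send() is called. A minimal, hypothetical driver for the example above (the printer sink below is not part of the source project) could look like this:

def printer():
    # Simple sink coroutine that prints whatever it receives.
    while True:
        sentence = (yield)
        print(sentence)

sink = printer()
next(sink)                      # prime the sink so it reaches its first (yield)
pipeline = tokenize_sentences([sink])
next(pipeline)                  # prime the tokenizer stage
pipeline.send("NLTK splits text. Each sentence flows downstream.")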

Example 14: extract_wiki_sentences

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def extract_wiki_sentences(title, text, n_sentences, replace_title_mentions=''):
    """
    Extracts the first n_sentences from the text of a Wikipedia page corresponding to the title.
    replace_title_mentions controls handling of references to the title in the text.
    Oftentimes QA models learn *not* to answer entities mentioned in the question, so this helps deal with that
    in the domain adaptation case.

    :param title: title of page
    :param text: text of page
    :param n_sentences: number of sentences to use
    :param replace_title_mentions: Replace mentions with the provided string token, by default removing them
    :return:
    """
    # Get simplest representation of title and text
    title = unidecode(title).replace('_', ' ')
    text = unidecode(text)

    # Split on non-alphanumeric
    title_words = re.split('[^a-zA-Z0-9]', title)
    title_word_pattern = '|'.join(re.escape(w.lower()) for w in title_words)

    # Breaking by newline yields paragraphs. Ignore the first since its always just the title
    paragraphs = [p for p in text.split('\n') if len(p) != 0][1:]
    sentences = []
    for p in paragraphs:
        formatted_text = re.sub(title_word_pattern, replace_title_mentions, p, flags=re.IGNORECASE)
        # Cleanup whitespace
        formatted_text = re.sub('\s+', ' ', formatted_text).strip()

        sentences.extend(nltk.sent_tokenize(formatted_text))

    return sentences[:n_sentences] 
Author: Pinafore, Project: qb, Lines: 34, Source: cached_wikipedia.py
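For illustration, a hypothetical call with a toy page (the title and text below are invented, and the imports from the source file such as re, nltk and unidecode are assumed to be available) drops the title mentions and returns the first two sentences:

sents = extract_wiki_sentences(
    title="Alan Turing",
    text="Alan Turing\nAlan Turing was a British mathematician. He worked at Bletchley Park.",
    n_sentences=2,
)
print(sents)
# Roughly: ['was a British mathematician.', 'He worked at Bletchley Park.']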

Example 15: tokenize_train

# Required module: import nltk [as alias]
# Or: from nltk import sent_tokenize [as alias]
def tokenize_train(train_directory, tokenized_directory):
    with codecs.open(train_directory, "r", "utf-8") as file:
        with codecs.open(tokenized_directory, "w", "utf-8") as writer:
            new_sens = []
            for line in file:
                sentences = sent_tokenize(line.strip())
                for sen in sentences:
                    sen = word_tokenize(sen.lower())
                    new_sen = ' '.join(sen)
                    new_sens.append(new_sen)
                    writer.write(new_sen)
                    writer.write("\n")
    sentences = gensim.models.word2vec.LineSentence(tokenized_directory)
    return sentences 
Author: Tian312, Project: EliIE, Lines: 17, Source: word2vec.py


Note: The nltk.sent_tokenize method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are drawn from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Please do not reproduce without permission.