

Python PunktSentenceTokenizer.tokenize Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.PunktSentenceTokenizer.tokenize. If you are wondering exactly what PunktSentenceTokenizer.tokenize does or how to use it, the curated code examples below should help. You can also explore further usage examples of the containing class, nltk.tokenize.PunktSentenceTokenizer.


The following shows 15 code examples of the PunktSentenceTokenizer.tokenize method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
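
Before working through the collected examples, here is a minimal, self-contained sketch of the method itself (the sample and training strings are illustrative assumptions, not taken from any of the projects listed below):

from nltk.tokenize import PunktSentenceTokenizer

sample_text = "Dr. Brown arrived at 9 a.m. He gave a short talk. Questions followed."

# An untrained tokenizer uses the default Punkt parameters.
tokenizer = PunktSentenceTokenizer()
print(tokenizer.tokenize(sample_text))

# Passing training text to the constructor lets Punkt learn abbreviations
# and sentence-boundary statistics from your own domain.
train_text = "Dr. Brown works at Acme Corp. He writes reports. Ms. Lee reviews them."
trained_tokenizer = PunktSentenceTokenizer(train_text)
print(trained_tokenizer.tokenize(sample_text))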

Example 1: __init__

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
 def __init__(self,sentence):
    f = open('data/training_data', 'r')
    train_text=f.read()
    #data=open('data2','r')
    #test_data=data.read()
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    self.tokenized = custom_sent_tokenizer.tokenize(sentence)
Author: codehacken, Project: Athena, Lines: 9, Source: nlp.py

Example 2: POS_tagging

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    #print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)

    # textfile = open("POS_tagged",'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)

    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []
    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            c=0
            # print(str(e))
    process_content()
    return tuples_list
Author: achuth-noob, Project: CHAT-BOT, Lines: 27, Source: C_F_testing.py

Example 3: extractNounPhrases

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def extractNounPhrases(sentence):

    nounPhrases = []
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        firstNN = False

        for tag in tagged:
            pos = tag[1]
            if "NN" in pos:
                if firstNN:
                    nounPhrase = firstNoun + " " + tag[0]
                    nounPhrases.append(nounPhrase)
                    firstNN = False
                    continue
                else:
                    firstNoun = tag[0]
                    firstNN = True
                    continue

            firstNN = False

    except Exception as e:
        print(str(e))

    return nounPhrases
Author: robienoor, Project: NLTKForumScraper, Lines: 33, Source: naturalLanguageWhiz.py

Example 4: Tokenizer

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
class Tokenizer(object):

    def __init__(self, language, normalize=False, train_text_gen=None):
        """
        A tokenizer using NLTK Penn Treebank tokenizer, and the Punkt sentence tokenizer.
        Params:
        language: Language to tokenize (currently doesn't do anything)
        train_text_gen: A generator of training text for the sentence tokenizer.
        """
        self.language = language
        self.train_text_gen = train_text_gen
        self.normalize = normalize
        
        if train_text_gen:
            self.sent_tokenizer = self._train_sentence_tokenizer()
        else:
            self.sent_tokenizer = PunktSentenceTokenizer()

    def _train_sentence_tokenizer(self):
        return PunktSentenceTokenizer(train_text="\n".join(self.train_text_gen))

    def tokenize(self, text):
        tokenized = []
        for sentence in self.sent_tokenizer.tokenize(text):
            tokenized_sentence = []
            for word in word_tokenize(sentence):
                if self.normalize:
                    word = word.lower()
                tokenized_sentence.append(word)
            tokenized.append(tokenized_sentence)

        return tokenized
Author: hihihippp, Project: plainstream, Lines: 34, Source: tokenizer.py
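
A hypothetical usage sketch for the Tokenizer class above (not part of the original project; it assumes the class and its imports, PunktSentenceTokenizer and word_tokenize from nltk.tokenize, are already defined in scope):

# Hypothetical usage; assumes the Tokenizer class from Example 4 is defined.
tok = Tokenizer("english", normalize=True)
sentences = tok.tokenize("The quick brown fox jumps. Then it rests.")
print(sentences)  # a list of sentences, each a list of lower-cased tokens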

Example 5: get_sentences

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
    def get_sentences(self, remove_url=True):
        '''
        generator
        :param remove_url --> replace URLs in sentences with one space char ;
        :return: tuple of sentences for each mime-part ;
        '''

        tokenizer = PunktSentenceTokenizer()

        for raw_line, mime_type, lang in tuple(self.get_text_mime_part()):

            if 'html' in mime_type:
                soup = BeautifulSoup(raw_line)
                if not soup.body:
                    continue
                # cause exactly sentences are needed, soup.body.strings returns lines+0d0a
                lines = tuple(soup.body.strings)
                raw_line = ''.join(lines)

            try:
                sents = tuple(tokenizer.tokenize(raw_line))
            except Exception as err:
                sents = tuple(raw_line)

            if remove_url:
                sents = tuple(map(lambda sent: self.__URLINTEXT_PAT.sub(' ', sent.lower()), sents))

            sents = (s.strip().lower() for s in sents)
            sents = tuple(s for s in tuple(sents) if s)
            if len(sents) == 0:
                continue

            yield sents
Author: ml-course-stanford, Project: algos, Lines: 35, Source: msg_wrapper.py

Example 6: normalize

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def normalize(text):
    p = PunktSentenceTokenizer()
    bullet1 = '\xe2\x80\xa2'.decode('utf-8')
    bullet2 = '\xc2\xb7'.decode('utf-8')
    usable = ''
    for sentence in p.tokenize(text):
        if len(sentence) < 500:
            if bullet1 not in sentence and bullet2 not in sentence:
                usable += '%s ' % sentence
    return usable
Author: tristaneuan, Project: wikia-nlp, Lines: 12, Source: batch-named-entity-harvester.py
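
Note that the snippet above is Python 2 code: calling .decode('utf-8') on a str raises AttributeError on Python 3. A minimal Python 3 equivalent might look like this (a sketch, not taken from the original project):

from nltk.tokenize import PunktSentenceTokenizer

def normalize(text):
    # Keep only sentences shorter than 500 characters that contain no bullet characters.
    p = PunktSentenceTokenizer()
    bullet1 = '\u2022'  # same character as '\xe2\x80\xa2'.decode('utf-8') in Python 2
    bullet2 = '\u00b7'  # same character as '\xc2\xb7'.decode('utf-8') in Python 2
    usable = ''
    for sentence in p.tokenize(text):
        if len(sentence) < 500 and bullet1 not in sentence and bullet2 not in sentence:
            usable += '%s ' % sentence
    return usable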

Example 7: tokenize_english_document

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
Author: ebu, Project: ebu-tt-live-toolkit, Lines: 56, Source: common.py

Example 8: aristo_get_named_entities

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
 def aristo_get_named_entities(self, text):
     """
     Parses the texts to obtain named entities
     :param text: The text to parse
     :return:returns a named entity treexw
     """
     custom_sent_tokenizer = PunktSentenceTokenizer(text)
     tokenized = custom_sent_tokenizer.tokenize(text)
     for i in tokenized[5:]:
         words = nltk.word_tokenize(i)
         tagged = nltk.pos_tag(words)
         namedEnt = nltk.ne_chunk(tagged, binary=False)
         return ((namedEnt))
Author: elangovana, Project: Aristo, Lines: 15, Source: text_analyser.py

Example 9: tag

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def tag(sentence):

    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        return tagged

    except Exception as e:
        print(str(e))
Author: robienoor, Project: NLTKForumScraper, Lines: 15, Source: naturalLanguageWhiz.py

Example 10: name_ent_recog

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
Author: achuth-noob, Project: CHAT-BOT, Lines: 16, Source: join_sub_obj.py

Example 11: sentenceTagging

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def sentenceTagging(text, trainingText):
    csTokenizer = PunktSentenceTokenizer(trainingText)
    tokenized = csTokenizer.tokenize(text)
    taggedSentence = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            taggedSentence.append(tagged)
            #chinkingWords(tagged).draw()
            namedEntityRecog(tagged)
    except Exception as e:
        print(str(e))

    return taggedSentence
Author: subhodip, Project: hacktags, Lines: 17, Source: createTags.py

Example 12: pos

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
	def pos(self, paragraph):

		wordsdict = collections.OrderedDict()
		sent_tokenizer = PunktSentenceTokenizer()

		for sentence in self.sent_detector.tokenize(paragraph):
			tokens = sent_tokenizer.tokenize(sentence)

			for token in tokens:
				words = nltk.word_tokenize(token)
				tagged = nltk.pos_tag(words)
				for word in tagged:
					if word[1] in self.tagdict:
						wordsdict[word[0]] = self.tagdict[word[1]][0]

		return wordsdict
Author: ponrajuganesh, Project: POSTagger, Lines: 18, Source: analysis.py

Example 13: main

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def main():
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    choice = 0
    while choice < 5:
        choice = input("1 for named_chunks. This provides some information about proper nouns.\n, 2 for process_chunks. This tells you if a noun phrase followed by n adverb occurs., \n3 for proccess content, this just prints stuff, 4 for...")
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print "try again, bitch!"
Author: EricChristensen, Project: Python_Randomness, Lines: 19, Source: PosTagging.py

Example 14: get_sentence_occurrences

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def get_sentence_occurrences(document, terms, doc_term=None):
    terms_present = get_document_occurrences(document, terms)

    # Use a Tokenizer from NLTK to build a sentence list
    tokenizer = Tokenizer(document)
    sentences = tokenizer.tokenize(document)
    
    # Create a list of lists containing the collection of terms which cooccurr
    # in a sentence
    occurrences = []
    for sentence in sentences:
        sentence_occurrences = set() 

        for term in terms_present:
            if term != doc_term:
                if re.search(' %s ' % term.label, sentence):
                    sentence_occurrences.add(term)
        

        if len(sentence_occurrences) > 0:
            sentence_occurrences = list(sentence_occurrences)
            to_remove = set()

            for inside in sentence_occurrences:
                for term in sentence_occurrences:
                    if term != inside and\
                        term.label.find(inside.label) != -1:
                        to_remove.add(inside)
            
            if to_remove:
                print "removing", to_remove

            for term in to_remove:
                sentence_occurrences.remove(term)

            if doc_term:
                sentence_occurrences.append(doc_term)

            occurrences.append(sentence_occurrences)
    
    return occurrences
Author: camerontt2000, Project: inphosite, Lines: 43, Source: datamining.py

Example 15: __init__

# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
class NER:
    """docstring for ClassName"""
    def __init__(self, query):
        self.original_query = query
        conf = shelve.open('conf')
        self.train_text = conf['train_text']
        self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
        self.tokenized = self.custom_sent_tokenizer.tokenize(self.original_query)

    def processContent(self):
        try:
            for i in self.tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                #print(namedEnt)
                #namedEnt.draw()
            return namedEnt
        except Exception as e:
            print(str(e))
        

    # Parse named entities from tree
    def structureNamedEntities(self):
        ne = []
        for subtree in self.named_entity_tree:
            if type(subtree) == Tree:  # If subtree is a noun chunk, i.e. NE != "O"
                ne_label = subtree.label()
                ne_string = " ".join([token for token, pos in subtree.leaves()])
                ne.append((ne_string, ne_label))
        return ne

    def performNER(self):
        self.named_entity_tree = self.processContent()
        #print(type(self.named_entity_tree))
        self.named_entity_tuple = self.structureNamedEntities()
        #print(ne)
        names = [element[0] for element in self.named_entity_tuple]
        return names
Author: Mitgorakh, Project: myproject, Lines: 41, Source: ner.py


Note: The nltk.tokenize.PunktSentenceTokenizer.tokenize examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and the copyright of the source code belongs to the original authors. Please refer to the corresponding project's license before distributing or using the code; do not reproduce this article without permission.