

Python PunktSentenceTokenizer.sentences_from_text Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.punkt.PunktSentenceTokenizer.sentences_from_text. If you are unsure what PunktSentenceTokenizer.sentences_from_text does or how to call it, the curated examples below should help. You can also browse further usage examples of nltk.tokenize.punkt.PunktSentenceTokenizer itself.


The sections below show 14 code examples of PunktSentenceTokenizer.sentences_from_text, drawn from open-source projects and sorted by popularity by default.
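
Before the project-specific examples, here is a minimal, self-contained sketch of the basic call (the sample text is made up for illustration). sentences_from_text takes a raw string and returns a list of sentence strings; realign_boundaries=True re-attaches trailing punctuation such as closing quotes or brackets to the sentence they belong to.

from nltk.tokenize.punkt import PunktSentenceTokenizer

# An untrained tokenizer still splits on the default sentence-ending
# punctuation; training mainly improves handling of abbreviations.
tokenizer = PunktSentenceTokenizer()

text = "This is the first sentence. Here is a second one! And a third?"

for sentence in tokenizer.sentences_from_text(text, realign_boundaries=True):
    print(sentence)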

Example 1: sentence_tokenizer

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
    def sentence_tokenizer(self, untokenized_string, language):
        """Reads language .pickle for right language"""
        if language == 'greek':
            pickle_path = os.path.expanduser('~/cltk_data/greek/cltk_linguistic_data/tokenizers/sentence/greek.pickle')
            language_punkt_vars = PunktLanguageVars
            language_punkt_vars.sent_end_chars = ('.', ';')
            language_punkt_vars.internal_punctuation = (',', '·')
        elif language == 'latin':
            pickle_path = os.path.expanduser('~/cltk_data/latin/cltk_linguistic_data/tokenizers/sentence/latin.pickle')
            language_punkt_vars = PunktLanguageVars
            language_punkt_vars.sent_end_chars = ('.', '?', ':')
            language_punkt_vars.internal_punctuation = (',', ';')
        else:
            # Without a supported language, pickle_path would be undefined below.
            raise ValueError("No sentence tokenizer for this language available.")

        with open(pickle_path, 'rb') as open_pickle:
            tokenizer = pickle.load(open_pickle)
        tokenizer.INCLUDE_ALL_COLLOCS = True
        tokenizer.INCLUDE_ABBREV_COLLOCS = True
        params = tokenizer.get_params()
        sbd = PunktSentenceTokenizer(params)
        tokenized_sentences = []
        for sentence in sbd.sentences_from_text(untokenized_string,
                                                realign_boundaries=True):
            tokenized_sentences.append(sentence)
        return tokenized_sentences
Author: smargh, Project: cltk, Lines: 28, Source: tokenize_sentences.py
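
Example 1 (like Examples 6 and 9 below) loads a pickled, pre-trained Punkt model and rebuilds a tokenizer from its parameters. For context, here is a rough sketch of how such a model could be trained with NLTK's PunktTrainer; the corpus path is hypothetical, and the two flags mirror the ones the examples toggle (note that they only take effect when set before training).

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

with open('corpus.txt', encoding='utf-8') as f:   # hypothetical training corpus
    training_text = f.read()

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True       # learn all collocations
trainer.INCLUDE_ABBREV_COLLOCS = True    # including abbreviation collocations
trainer.train(training_text, finalize=True)

# get_params() returns a PunktParameters object, which the
# PunktSentenceTokenizer constructor accepts directly.
sbd = PunktSentenceTokenizer(trainer.get_params())
sentences = sbd.sentences_from_text(training_text, realign_boundaries=True)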

Example 2: featureize

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def featureize(F, observation_files):

    word_tokenizer = PunktSentenceTokenizer()
    sent_tokenizer = PunktSentenceTokenizer()

    m = len(observation_files)

    # X is m x 2: one row of features per observation file
    X = np.zeros((m,2), dtype=np.float)

    for (i,filename) in enumerate(observation_files,start=0):

        file_text  = read_file(filename).decode('string_escape')

        try:
            num_sents = len(sent_tokenizer.sentences_from_text(file_text))
        except UnicodeDecodeError:
            num_sents = 2

        #num_tokens = len(word_tokenize(file_text))
        num_tokens = len(file_text.split())

        # Return two features: 
        # 1 (0) - Number of sentences per file
        # 2 (1) - Number of tokens per file
        X[i][0] = num_sents
        X[i][1] = num_tokens

    return X
Author: mikeswoods, Project: cis530-project, Lines: 31, Source: sentence_info.py

Example 3: tokenize

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
    def tokenize(self):
        """
        Returns a list of tokenized sentences
        """
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.sentences_from_text(self.text)
        sentences = [sentence.split() for sentence in sentences]
        sentences = [[word.strip(",.?!") for word in sentence] for sentence in sentences]
        return sentences
Author: dkush, Project: bluestocking, Lines: 11, Source: parse.py

Example 4: preprocess_doc

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def preprocess_doc(doc):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(doc)    
    tokens = []
    for sentence in sentences:
        #sentence1 = sentence.split()
        sentence1 = neg_scope(sentence)
        tokens.extend(w for w in sentence1 if w.lower() not in stopwords.words("english"))
    for ii in xrange(len(tokens)):
        if tokens[ii][-1] == '.':
            tokens[ii] = tokens[ii][:-1]
    return tokens
Author: dwins, Project: bluestocking, Lines: 14, Source: ShallowConsistency.py

Example 5: GCBlockExtractor

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
class GCBlockExtractor(ExtractionMapper):

    def __init__(self):
        super(GCBlockExtractor, self).__init__(
            extraction_function=self._blocks_from_text)
        self.tokenizer = PunktSentenceTokenizer()

    def _blocks_from_text(self, page):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                page.text.replace('\n', '')):
            if sentence.strip():
                blocks.append(len(sentence))
            # maybe count tokens? or non-spaces?
        return blocks
Author: christianbuck, Project: CorpusMining, Lines: 17, Source: scorer.py

Example 6: tokenize_sents_latin

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def tokenize_sents_latin(sentences_string):
    """Tokenize a Latin string into sentences"""
    global tokenenized_sentences
    pickle_name = 'latin.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_latin/', pickle_name)
    with open(pickle_path, 'rb') as f:
        train_data = pickle.load(f)
    train_data.INCLUDE_ALL_COLLOCS = True
    train_data.INCLUDE_ABBREV_COLLOCS = True
    params = train_data.get_params()
    sbd = PunktSentenceTokenizer(params)
    tokenenized_sentences = []
    for sentence in sbd.sentences_from_text(sentences_string, realign_boundaries=True):
        tokenenized_sentences.append(sentence)
    #print(tokenenized_sentences)
    return tokenenized_sentences
Author: AmitShilo, Project: cltk, Lines: 18, Source: sentence_tokenizer.py

Example 7: raw_records

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def raw_records(crawl_collection,start):

  # Prepare a naive sentence tokeniser utility
  pst = PunktSentenceTokenizer()

  for rec in crawl_collection.query({'downloaded': True},field=None,skip=start):
    _id     = rec['_id']
    if rec['content'] is None:
      continue
    content = rec['content']['contents']
    # A wiki page may comprise multiple content sections
    for c in content:
      # Explode a long topic into list of sentences
      sentences = pst.sentences_from_text(c)
      for s in sentences:
        yield (_id,s)
Author: starcolon, Project: vor-knowledge-graph, Lines: 18, Source: create_pos_patterns.py

Example 8: add_sents

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def add_sents(invid=None):
    if invid:
        findObj = {"_id": invid}
    else:
        findObj = {}
    for vd in vdigests.find(findObj):
        if not vd.get("nSentences") and vd.get('alignTrans') and vd.get('alignTrans').get('words'):
            twords = vd['alignTrans']['words']
            twords_len = len(twords)
            trans = " ".join([wrd["word"] for wrd in twords])
            STokenizer = PunktSentenceTokenizer()
            token_sents = STokenizer.sentences_from_text(trans)
            cwct = 0
            sentct = 0
            curword = twords[cwct]
            for tsent in token_sents:
                tswords = tsent.split(" ")
                for wnum, tsword in enumerate(tswords):
                    if tsword == curword["word"]:
                        curword["sentenceNumber"] = sentct
                        cwct += 1
                        if cwct < twords_len:
                            curword = twords[cwct]
                    else:
                        print "warning: not a one-to-one match: ", curword["word"], tsword
                        if wnum == 0:
                            curword["sentenceNumber"] = sentct - 1
                            cwct += 1
                            if cwct < twords_len:
                                curword = twords[cwct]
                        elif wnum == len(tswords) - 1:
                            curword["sentenceNumber"] = sentct
                        else:
                            ipdb.set_trace()
                sentct += 1
            vd['nSentences'] = len(token_sents)
            # write the separated sentences to file
            ssout_name = "ss-" + vd["_id"]
            outf = open("../ffdata/rawtrans/" + ssout_name, 'w')
            outf.write("\n".join(token_sents))
            outf.close()
            vd['sentSepTransName'] = ssout_name
            vdigests.save(vd)
Author: strob, Project: vdigests, Lines: 45, Source: add_sentences.py

Example 9: tokenize_greek_sentences

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def tokenize_greek_sentences(sentences_string):
    global tokenenized_sentences
    pickle_name = 'greek.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_greek/', pickle_name)
    with open(pickle_path, 'rb') as f:
        train_data = pickle.load(f)
    train_data.INCLUDE_ALL_COLLOCS = True
    train_data.INCLUDE_ABBREV_COLLOCS = True
    params = train_data.get_params()
    sbd = PunktSentenceTokenizer(params)
    '''
    with open(input_file) as f:
        to_be_tokenized = f.read()
    '''
    tokenenized_sentences = []
    for sentence in sbd.sentences_from_text(sentences_string, realign_boundaries=True):
        tokenenized_sentences.append(sentence)
    #print(tokenenized_sentences)
    return tokenenized_sentences
Author: AmitShilo, Project: cltk, Lines: 21, Source: sentence_tokenizer.py

Example 10: export_crawl_to_text

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def export_crawl_to_text(mineDB):
  
  # Prepare a naive sentence tokeniser utility
  pst = PunktSentenceTokenizer()

  text_path = os.path.realpath('./mine.txt')

  with codecs.open(text_path, 'w', 'utf-8') as f:
    m = 0
    for wiki in mineDB.query({'downloaded': True},field=None):
      
      # Skip empty content or the added one
      if wiki['content'] is None or 'added_to_graph' in wiki:
        continue

      content = wiki['content']

      # A wiki page may comprise multiple content sections
      for c in content['contents']:
        # Explode content into sentences
        sentences = pst.sentences_from_text(c)
        print('... content #{} ==> {} sentences extracted.'.format(m, len(sentences)))

        for s in sentences:
          # Cleanse the sentence
          s_ = cleanse(s)
          # Filter out noise by length
          if len(s_)<5 or len(s_.split(' '))<3:
            continue
          f.write(s_.lower() + '\n')

      m += 1

      if m>=args['limit']:
        print(colored('[Ending] Maximum number of topics reached.','yellow'))
        break

  return text_path
Author: starcolon, Project: vor-knowledge-graph, Lines: 40, Source: build_wordvec.py

Example 11: GaleChurchAlignmentDistance

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
class GaleChurchAlignmentDistance(DistanceScorer):

    def __init__(self):
        self.name = "Gale Church Alignment Scorer"
        self.tokenizer = PunktSentenceTokenizer()
        self.sblocks, self.tblocks = [], []

    def _blocks_from_text(self, text):
        blocks = []
        for sentence in self.tokenizer.sentences_from_text(
                text.replace('\n', '')):
            blocks.append(len(sentence))
            # maybe count tokens? or non-spaces?
        return blocks

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.sblocks.append(self._blocks_from_text(page.text))
        for url, page in target_corpus.iteritems():
            self.tblocks.append(self._blocks_from_text(page.text))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        return self.gc.align_score(self.sblocks[s_idx], self.tblocks[t_idx])
Author: christianbuck, Project: CorpusMining, Lines: 25, Source: scorer.py

Example 12: iter_topic

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
def iter_topic(crawl_collection,start):
  
  # Prepare a naive sentence tokeniser utility
  pst = PunktSentenceTokenizer()
  
  n = 0
  
  for wiki in crawl_collection.query({'downloaded': True},field=None,skip=start):
    
    # Skip empty content or the added one
    if wiki['content'] is None or 'added_to_graph' in wiki:
      continue

    m = 0
    content = wiki['content']
    
    if args['verbose']:
      print(colored('[Extracting wiki] : ','cyan'), content['title'])
    
    # A wiki page may comprise multiple content sections
    for c in content['contents']:
      # Explode a long topic into list of sentences
      sentences = pst.sentences_from_text(c)
      for s in sentences:
        m += 1
        yield (content['title'],s.split(' '))

    # After all sentences are processed,
    # mark the current wiki record as 'processed'
    crit = {'_id': wiki['_id']}
    crawl_collection.update(crit, {'$set':{'added_to_graph':True}})

    n += 1
    if args['verbose']:
      print(content['title'] + " processed with {0} nodes.".format(m))
      print(colored("{0} wiki documents processed so far...".format(n),'blue'))
Author: starcolon, Project: vor-knowledge-graph, Lines: 38, Source: build_knowledge.py

Example 13: SimhashDistance

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
class SimhashDistance(DistanceScorer):
    CHAR, TOKEN = range(2)

    def __init__(self, source_tokenizer, target_tokenizer, n=2, level=TOKEN):
        self.name = "Simhash Distance Scorer, n=%d" % n
        self.sentence_splitter = PunktSentenceTokenizer()
        self.s_hashes, self.t_hashes = [], []

        self.source_tokenizer = source_tokenizer
        if not source_tokenizer:
            self.source_tokenizer = SpaceTokenizer()

        self.target_tokenizer = target_tokenizer
        if not target_tokenizer:
            self.target_tokenizer = SpaceTokenizer()

        def ngrams(n, tokenizer, page):
            result = []
            text = page.text.replace('\n', '')
            for sentence in self.sentence_splitter.sentences_from_text(text):
                if not sentence.strip():
                    continue
                # if '\n' in sentence:
                #     print repr(sentence)
                assert '\n' not in sentence, sentence
                words = tokenizer.process(sentence).strip().split()
                result += [" ".join(words[i:i + n]) for i in
                           range(max(len(words) - n + 1, 1))]
            return result

        def tokens(n, tokenizer, page):
            # 180/1grams
            # words = page.html.split()
            words = filter(None, re.split("[^0-9a-zA-Z]", page.text))
            return [" ".join(words[i:i + n]) for i in
                    range(max(len(words) - n + 1, 1))]

        def chars(n, tokenizer, page):
            s = "".join(page.text.split())
            return [" ".join(s[i:i + n]) for i in
                    range(max(len(s) - n + 1, 1))]

        def html_tokens(n, tokenizer, page):
            # 153/trigrams
            words = page.html.split()
            return [" ".join(words[i:i + n]) for i in
                    range(max(len(words) - n + 1, 1))]

        if level == SimhashDistance.TOKEN:
            self.source_features = partial(tokens, n, self.source_tokenizer)
            self.target_features = partial(tokens, n, self.target_tokenizer)
        elif level == SimhashDistance.CHAR:
            self.source_features = partial(chars, n, self.source_tokenizer)
            self.target_features = partial(chars, n, self.target_tokenizer)
        # self.source_features = partial(ngrams, n, self.source_tokenizer)
        # self.target_features = partial(ngrams, n, self.target_tokenizer)
        # print self.source_features("How are you?\nI am fine. Thanks.")

    def _words_from_text(self, text, tokenizer):
        words = set()
        for line in self.sentence_splitter.sentences_from_text(text):
            for w in tokenizer.process(line).split("\n"):
                words.add(w)
        return words

    def _extract(self, source_corpus, target_corpus):
        for url, page in source_corpus.iteritems():
            self.s_hashes.append(Simhash(self.source_features(page)))
        for url, page in target_corpus.iteritems():
            self.t_hashes.append(Simhash(self.target_features(page)))

    def _score_pair(self, s_idx, s_page, t_idx, t_page):
        return -self.s_hashes[s_idx].distance(self.t_hashes[t_idx])

    def get_features(self, text):
        width = 3
        text = self.sentence_splitter.sentences_from_text(text)
        return [text[i:i + width] for i in
                range(max(len(text) - width + 1, 1))]
Author: christianbuck, Project: CorpusMining, Lines: 81, Source: scorer.py

Example 14: len

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import sentences_from_text [as alias]
ref_list = " ".join(ref)

line = sent_detector.tokenize(ref_list.strip())


author_name= []
year_of_pub= []
paper_name=[]
journal_name=[]

year_found = False
req_idx = 1
for i in Reference:
    line = sent_detector.tokenize(i.strip())
    line2 = sent_detector.sentences_from_text(i.strip())
    References.append(line)
    line3 = [x for x in line if x != "."]
    if len(line3)==4:
        j=0        
        author_name.append(line3[j])
        year_of_pub.append(line3[j+1])
        paper_name.append(line3[j+2])
        journal_name.append(line3[j+3])
    else:
        name_str = []
        regex = re.compile("(\d{4})")
        idx=0
        req_idx = 1
        while(idx<len(line3)):
            result = re.findall(regex,line3[idx])
Author: sidhartha4, Project: OCR, Lines: 32, Source: find_ref.py
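
A note on Example 14: in NLTK, PunktSentenceTokenizer.tokenize() simply wraps sentences_from_text(), so line and line2 above should hold the same list of sentences (assuming sent_detector is a PunktSentenceTokenizer; the fragment does not show how it was built). A minimal check, with a made-up reference string:

from nltk.tokenize.punkt import PunktSentenceTokenizer

sent_detector = PunktSentenceTokenizer()
ref = "Doe J. 2001. A paper title. Some Journal."   # made-up example string
# tokenize() delegates to sentences_from_text(), so the results match.
assert sent_detector.tokenize(ref) == sent_detector.sentences_from_text(ref)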


Note: The nltk.tokenize.punkt.PunktSentenceTokenizer.sentences_from_text examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from projects contributed by open-source developers, and copyright of the source code remains with the original authors; refer to each project's license before distributing or using it. Do not reproduce this article without permission.