

Python nltk.regexp_tokenize Function Code Examples

This article collects typical usage examples of the nltk.regexp_tokenize function in Python. If you are wondering how to use regexp_tokenize, what it is for, or what it looks like in practice, the hand-picked code examples below should help.


Fifteen code examples of the regexp_tokenize function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Python code examples.
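
For orientation before the project-specific examples, here is a minimal, self-contained sketch of calling nltk.regexp_tokenize directly. The sample text, the pattern, and the expected output are illustrative only and are not taken from any of the projects below.

import nltk

# A small verbose pattern: abbreviations, hyphenated words, currency/percentages.
# Non-capturing groups (?:...) are used so the tokenizer returns whole matches.
pattern = r'''(?x)
      (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*          # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
'''

text = "That U.S.A. poster-print costs $12.40..."
tokens = nltk.regexp_tokenize(text, pattern)
print(tokens)
# expected: ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40']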

Example 1: get_freqs

def get_freqs(text):

    stop_words = nltk.corpus.stopwords.words('english')
    frequencies = defaultdict(int)

    pattern = r'''(?x)    # set flag to allow verbose regexps
                    ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
                    | \w+(-\w+)*        # words with optional internal hyphens
                    | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
                    | \.\.\.            # ellipsis
                    | [][.,;"'?():-_`]  # these are separate tokens
                     '''

    if type(text) == list:
        print 'number of links: '+ str(len(text))
        for t in text:
            content = t['content']
            tokens = nltk.regexp_tokenize(content, pattern)
            for word in tokens:
                if len(word) > 2 and word.lower() not in stop_words:
                    cap = word[0].upper() + word[1:]
                    frequencies[cap] += 1
    else:
        tokens = nltk.regexp_tokenize(text, pattern)
        for word in tokens:
            if len(word) > 2 and word not in stop_words:
                frequencies[word] += 1
    print "frequency size: "+str(len(frequencies))
    return frequencies
Author: seemless, Project: chainlink, Lines: 29, Source: chainlink_util.py

Example 2: bag_of_words

def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
                    
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Author: Juicechuan, Project: workspace, Lines: 33, Source: naive_bayes.py

Example 3: load

def load(f=str):
    import re
    files = open(f)
    raw = files.read()
    # the two alternatives must be joined with "|"; an inline (?x) flag keeps
    # the verbose-mode comments, and (?:...) groups keep whole-match tokens
    pattern = r"""(?x)
          \$?\d+(?:\.\d+)?%?     # currency and percentages
        | \d+/\d+/\d+            # dates
    """
    return nltk.regexp_tokenize(raw, pattern)
Author: MariaSpyropoulou, Project: NLTK-Book, Lines: 7, Source: Chapter3.py

Example 4: nltkTest

def nltkTest():
    s = "russia licenza 8.1.5 U.S."
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)

    s = "Saldo vs. Fattura n. 2015/004"
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
Author: cynricshu, Project: ChinaVis2016, Lines: 8, Source: handleSubject.py

Example 5: regularExpressionTokenizer

def regularExpressionTokenizer():
    text = 'That U.S.A. poster-print costs $12.40...'
    pattern = r'''(?x)         # set flag to allow verbose regexps 
            ([A-Z]\.)+        # abbreviations, e.g. U.S.A. 
          | \w+(-\w+)*        # words with optional internal hyphens 
          | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82% 
          | \.\.\.            # ellipsis 
          | [][.,;"'?():-_`]  # these are separate tokens 
    '''
    print nltk.regexp_tokenize(text, pattern)
Author: hbdhj, Project: python, Lines: 10, Source: chapter3.py

Example 6: get_links

def get_links(text):
    # checks only for  'http://...' and 'www...'
    text = text + " "
    pat = "http://.*?\s"
    links = nltk.regexp_tokenize(text, pat)
    text = " " + text + " "
    pat = "\swww\..*?\..*?\s"
    links.extend(nltk.regexp_tokenize(text, pat))
    links = map(lambda x: x[:-1], links)
    return links
Author: ItsLastDay, Project: Twitter-language-identification, Lines: 10, Source: string_processing.py

Example 7: poss_test

def poss_test(test_file,test_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)

    t = open(test_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    stopwords = sw
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a == 0:
            a += 1
            continue
        if a%1000 == 0:
            print a    
        a += 1
        #if a == 8:
        #    sys.exit(1)

        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)

        #light stem
        #title = set([stem(word) for word in title])
        #body = set(body)
        #body = set([stem(word) for word in body])

        #remove stopwords
        #body = filter(g,body)
        #title = filter(g,title)

        body = ' '.join(body)
        title = ' '.join(title)
        t.write('%s , %s \n'%(title,body))
Author: rve, Project: keyword, Lines: 53, Source: stem.py

Example 8: poss_test

def poss_test(test_file,test_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)

    t = open(test_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw 
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:

        if a%10000 == 0:
            print(a)
        a += 1
        #if a == 8:
        #    sys.exit(1)

        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"(\.?[a-z][a-z0-9\+\.\#\-]+[a-z0-9\+\#])"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])


        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s"\n'%(row[0],title,body))
Author: rve, Project: keyword, Lines: 51, Source: nltk_without_stem.py

Example 9: query_episode

    def query_episode(self, show_title, 
        ep_title, se_number, ep_number, runtime):
        """build video list prior to scoring
        """
        qres = {}

        # Query 1
        qlist = (show_title, ep_title)
        # Search YouTube
        tmp = self.search('%s %s' % qlist)
        for k, v in tmp.items():
            qres[k] = v
        # Query 2
        qlist = (show_title, ep_title, 
            se_number, ep_number)
        # Search YouTube
        tmp = self.search('%s %s  %s  %s' % qlist)
        for k, v in tmp.items():
            qres[k] = v
        # Query 3
        qlist = (show_title, 
            se_number, ep_number)
        # Search YouTube
        tmp = self.search('%s s%02de%02d' % qlist)
        for k, v in tmp.items():
            qres[k] = v

        # Show tokens
        sh_stem = [self._lancaster.stem(t) \
            for t in nltk.regexp_tokenize(
                show_title.encode('utf8'), r"\w+")]

        # Episode stem tokens if exist
        if ep_title:
            ep_stem = [self._lancaster.stem(t) \
                for t in nltk.regexp_tokenize(
                    ep_title.encode('utf8'), r"\w+")]
        else:
            ep_stem = None

        res = {'Output': qres, 
               'Input': {},}
        res['Input']['show_title'] = show_title
        res['Input']['ep_title'] = ep_title
        res['Input']['sh_stem'] = sh_stem
        res['Input']['ep_stem'] = ep_stem
        res['Input']['se_number'] = se_number
        res['Input']['ep_number'] = ep_number
        res['Input']['runtime'] = runtime

        return res
Author: BrianDurham, Project: couchtube, Lines: 51, Source: ytquery.py

Example 10: poss_train

def poss_train(train_file,train_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)

    t = open(train_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw  # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a%100000 == 0:
            print a    
        a += 1
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        #st = LancasterStemmer()
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])

        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n'%(row[0], title,body,row[3]))
Author: rve, Project: keyword, Lines: 50, Source: pre_nltk.py

Example 11: normalized

def normalized(text, lowercase=True, fix=True, tuples=False):
    """Tokenize, remove capitalization and exclude punctuation
    """
    if fix:
        text = fix_text(unicode(text))
    pattern = r"""(?x)    # verbose regexps
        \w+(-\w+)*        # words with optional internal hyphens
    """
    result = [w for w in nltk.regexp_tokenize(text, pattern)]
    if lowercase:
        result = [w.lower() for w in nltk.regexp_tokenize(text, pattern)]
    if tuples:
        result = tuple(result)
    return result
Author: elyase, Project: eikon_challenge, Lines: 14, Source: utils.py

Example 12: compute_df

    def compute_df(self, document_list):
        '''Compute document frequency based on input document list'''
        df_cache = dict()
        df_output = dict()

        d_index = 0
        for document in document_list:
            d_index += 1
            # tokenize each document
            reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE)
            for item in reg_toks:
                # change each word to lower case and lemmatize
                item = normalise(item)
                if item not in df_cache:
                    df_cache[item] = set([d_index])
                else:
                    df_cache[item].add(d_index)

        for item in df_cache:
            if acceptable_word(item):
                df_output[item] = len(df_cache[item])

        df_output['total_document'] = len(document_list)

        return df_output
Author: luotigerlsx, Project: DataAnalysis_ML, Lines: 25, Source: keyword_extract.py

Example 13: main

    def main(self, text):
        """Breaks a single string into a tree using the grammar and returns
        the specified words as a string."""

        if text is None:
            return None

        try:
            text = text.encode("ascii", "ignore")
        except:
            text = text.decode("utf-8", "ignore").encode("ascii", "ignore")

        chunker = nltk.RegexpParser(grammar)

        toks = nltk.regexp_tokenize(text, sentence_re)
        postoks = nltk.tag.pos_tag(toks)

        #print postoks
        tree = chunker.parse(postoks)

        terms = self.get_terms(tree)

        words = self.get_words(terms)

        return words
Author: hongyu89, Project: IndeedScraper, Lines: 25, Source: GrammarParser.py

Example 14: generate_vocab

def generate_vocab(papers):
    """Returns the vocabulary used in the papers given in parameters, after cleaning and stopwords removal.

    Args:
        papers (list of tuples): the raw list of papers from which generates the vocabulary (each element is a tuple of 3 strings: id, title and abstract)

    Returns:
        list of strings: the list of tokens forming the vocabulary
    """
    sc = StringCleaner()

    # Generate author's vocabulary
    corpus = " ".join(p[1] + " " + p[2] for p in papers)
    # Cleaning
    corpus = sc.clean_string(corpus)
    # Tokenization
    pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
    #         we keep tokens that are words (with optional internal hyphens), acronyms and percentages
    tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english"))
    num_re = re.compile("^\d+$")
    tokens = set([t for t in tokens if not num_re.match(t)]) # we remove only-numeric tokens
    # Stemming
    porter = nltk.stem.PorterStemmer()

    return [porter.stem(t) for t in tokens]
Author: tizot, Project: recom-system, Lines: 25, Source: dataset_tools.py

Example 15: extract

    def extract(self, text):
        ''' Extract and freudify noun phrases from text, return all successfully
        freudified noun phrases. '''

        toks = nltk.regexp_tokenize(text, self.sentence_re)
        postoks = nltk.tag.pos_tag(toks)
        tree = self.chunker.parse(postoks)
        terms = self._get_terms(tree)

        phrases = sets.Set()

        # Loop through all the noun phrases and try to freudify them.
        for term in terms:
            if (len(term)) < 2: continue
            changed = False
            context = ""
            phrase = []
            for part in term:
                word, tag = part
                word = word.encode('ascii', 'replace')
                phrase.append(word.lower())
                rpl = self.replace_word(tag[:2], word)
                if len(rpl[2]) > 0:
                    context = rpl[2]
                    phrase[-1] = rpl[0]
                    changed = True
            if changed:
                phrase = " ".join(phrase).strip()
                phrase.encode('ascii', 'replace')
                phrase = str(phrase)
                if phrase not in self.own_phrases[context]:
                    phrases.add((str(phrase), context))

        phrases = list(phrases)
        return phrases
Author: assamite, Project: agentwordgame, Lines: 35, Source: freud.py


Note: The nltk.regexp_tokenize function examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective developers; copyright of the source code belongs to the original authors, and distribution and use are subject to the corresponding project's license. Please do not reproduce without permission.