

Python PlaintextCorpusReader.paras Method Code Examples

This article collects typical usage examples of the nltk.corpus.PlaintextCorpusReader.paras method in Python. If you are wondering what PlaintextCorpusReader.paras does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples of the containing class, nltk.corpus.PlaintextCorpusReader.


Eleven code examples of the PlaintextCorpusReader.paras method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
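Before looking at the project code below, here is a minimal usage sketch of what PlaintextCorpusReader.paras returns. The corpus root '.' and the file name sample.txt are placeholders assumed for illustration only:

from nltk.corpus import PlaintextCorpusReader

# Assumption: sample.txt is a plain-text file in the current directory whose
# paragraphs are separated by blank lines.
reader = PlaintextCorpusReader('.', ['sample.txt'])

# paras() returns a list of paragraphs; each paragraph is a list of sentences,
# and each sentence is a list of word strings, e.g.
# [[['First', 'sentence', '.'], ['Second', 'one', '.']], ...]
paragraphs = reader.paras('sample.txt')
print(len(paragraphs))       # number of paragraphs in the file
print(paragraphs[0][0])      # first sentence of the first paragraph, as a word list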

Example 1: compare

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def compare(request):
    errors = []
    statistics = []
    stats = []
    for x in range(1, 3):
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)

    return render_to_response('compare.html', {'stats': statistics})
Developer: prashaantt, Project: savitri-labs, Lines: 31, Source file: views.py

Example 2: stats

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
Developer: prashaantt, Project: savitri-labs, Lines: 32, Source file: views.py

Example 3: extractParasInList

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def extractParasInList(name):
    corpuslocation = '/Users/anis/seniorProject/aligned Paragraphs/algebra'
    reader = PlaintextCorpusReader(corpuslocation, '.*\.txt')
    # paras() returns a list of paragraphs; each paragraph is itself a list of
    # sentences, and each sentence is a list of word strings.
    pList = []
    paragraphlist = reader.paras(name)  # e.g. 'simpleTuring.txt'
    numpara = len(paragraphlist)
    for sentlist in paragraphlist:
        numsent = len(sentlist)
        # Merge all sentence lists of this paragraph into a single word list.
        paraAsAList = []
        for i in range(numsent):
            paraAsAList = paraAsAList + sentlist[i]
        # Join the words back into one paragraph string.
        paraAsAString = ""
        for word in paraAsAList:
            paraAsAString = paraAsAString + word + str(" ")
        pList.append(paraAsAString)
    return pList
Developer: azaman13, Project: senior-project, Lines: 27, Source file: tf_idfBak.py

Example 4: train

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def train():

   wordlists = PlaintextCorpusReader('', file_path)

   st = stemmer()
   
   # Get blocks of text using NLTK
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # LOGIC
   #       If a sentence contains a known [posi/nega]tive word, count the instances of words in that sentence as 
   #       [posi/nega]tive

   # Count words
   word_features = []

   # Go through paragraphs
   for p in paras:

      # Classify S
      score_positive_negative = 0
      for s in p:
         for word in s:

            word = st.stem(word)

            if word in words_positive:
               score_positive_negative += 1
            elif word in words_negative:
               score_positive_negative -= 1
   
      # Record class of paragraph for any words present
      for s in p:
         for word in s:

            word = st.stem(word)

            if score_positive_negative > 0:
               word_features.append( ({"word": word}, "+") )
            elif score_positive_negative < 0:
               word_features.append( ({"word": word}, "-") )
            else:
               word_features.append( ({"word": word}, " ") )

   # Create and return classifier
   classifier = nltk.NaiveBayesClassifier.train(word_features)
   return classifier
Developer: ace-n, Project: cs398vl-mp3, Lines: 51, Source file: mp3-paragraph-classifier.py

Example 5: main

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def main():

   st = stemmer()

   # Get data
   wordlists = PlaintextCorpusReader('', file_path)
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # Train
   classifier = train()

   # Get class probabilities (for MAP estimation)
   counts = {"P":0, "-":0, "N":0}
   for i in range(0,len(paras)):
      for s in paras[i]:

         score_pos = 0
         score_neg = 0

         # Classify paragraph
         for word in s:

            word = st.stem(word)

            feature = {"word":word}
            classified = classifier.classify(feature)

            if classified == "+":
               score_pos += 1
            elif classified == "-":
               score_neg += 1

         # Record result
         if score_pos > score_neg:
            counts["P"] += 1
         elif score_pos < score_neg:
            counts["N"] += 1
         else:
            counts["-"] += 1

   # Done!
   print counts
Developer: ace-n, Project: cs398vl-mp3, Lines: 46, Source file: mp3-paragraph-classifier.py

Example 6: extractParasInList

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def extractParasInList(name):
    corpuslocation = '/home/aniszaman/seniorProject/combined/carnivore'
    reader = PlaintextCorpusReader(corpuslocation, '.*\.txt')
    # paras() returns a list of paragraphs; each paragraph is itself a list of
    # sentences, and each sentence is a list of word strings.
    pList = []
    paragraphlist = reader.paras(name)  # e.g. 'simpleTuring.txt'
    numpara = len(paragraphlist)
    for sentlist in paragraphlist:
        numsent = len(sentlist)
        # Merge all sentence lists of this paragraph into a single word list.
        paraAsAList = []
        for i in range(numsent):
            paraAsAList = paraAsAList + sentlist[i]
        # Join the words back into one paragraph string.
        paraAsAString = ""
        for word in paraAsAList:
            paraAsAString = paraAsAString + word + str(" ")
        pList.append(paraAsAString)
    return pList
Developer: azaman13, Project: senior-project, Lines: 21, Source file: similarity.py

Example 7: main

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def main(print_out, motifs, chapter):
   wordlists = PlaintextCorpusReader('', 'Punctuated/pot_ch[12345]\.txt')

   #rep_words = nltk.FreqDist(brown.words()) # Get representative word counts

   st = LancasterStemmer()
   #st = RegexpStemmer('ing$|s$|e$', min=4)

   for i in range(1,6): 
   
      if i != chapter:
        continue   
   
      g = nx.Graph()

      words = wordlists.words('Punctuated/pot_ch{!s}.txt'.format(str(i)))
      paras = wordlists.paras('Punctuated/pot_ch{!s}.txt'.format(str(i)))

      # Generate HTML
      #with open("test" + str(i) + ".txt", "w+") as fi:
      #   output = generate_html_level2(wordlists, st, words, paras, i)
      #   fi.write(output)
      
      json_dict = {}
      json_dict["nodes"] = []
      json_dict["edges"] = []

      # Get correlation coefficients
      corr_data = get_corr_coefs(wordlists, st, words, paras, print_out, motifs)
      corr_coefs = corr_data[0]
      corr_freqs = corr_data[1]

      # ---------------------------------- NetworkX ----------------------------------
      # Get NetworkX nodes
      nx_added_nodes = []
      for m1 in corr_coefs:
         g.add_node(m1)

      # Get NetworkX edges
      for m1 in corr_coefs:
         for m2 in corr_coefs[m1]:

             # Avoid repeats
             if m1 <= m2:
                 continue

             g.add_edge(m1, m2)

      # -------------------------------- End NetworkX --------------------------------

      # -------------------------------------- d3.js --------------------------------------
      # Get d3-js nodes
      json_node_numbers = dict()
      square_size = 0
      for m1 in corr_coefs:

         sz = int(min(corr_freqs[m1]/3.0,50))*3
         #print sz

         json_node  = {
                                    "name": m1,
                                    "size": str(sz),
                                    "color": "#aaaaaa"
                                }
         json_dict["nodes"].append(json_node)
         json_node_numbers[m1] = len(json_node_numbers)

      # Get d3-js edges
      m1m2 = 0
      for m1 in corr_coefs:
         for m2 in corr_coefs[m1]:

             # Avoid repeats
             if m1 <= m2:
                 continue

             # No need to worry about repeats, since corr_coefs won't contain them
             edge_size = corr_coefs[m1][m2]
             #print "ES " + m1 + "/" + m2 + ": " + str(edge_size)
             json_edge = {
                                   "name": m1 + "-" + m2,
                                   "source": json_node_numbers[m1],
                                   "target": json_node_numbers[m2],
                                   "size": str(edge_size)
                                   }
             json_dict["edges"].append(json_edge)

      # Add boundary d3-js node
      json_dict["nodes"].append({"id":"the-end",
                                                    "x":square_size,
                                                    "y":square_size,
                                                    "size":"1",
                                                    "color":"#000000"
      })

      # Write JSON to file
      if not print_out:
          with open("OFFICIAL/data" + str(i) + ".json", "w+") as fi:
            fi.write("var json_str_" + str(i) + "=" + json.dumps(json_dict, fi, indent = 2))
      else:
#.........part of the code omitted here.........
Developer: ace-n, Project: cs398vl-project, Lines: 103, Source file: main.py

Example 8: extract_data

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
    def extract_data(self, filepath, ind_features=_PARAIND_FEAT, dep_features=_PARADEP_FEAT, labels_per_sent=None, labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, 
            [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|["+string.punctuation+"]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])            
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"
        
        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([numpy_filepath_pca,]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup, len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE, ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)
        
        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(), arff_filepath, filename+"_RAW", labels_per_window, file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)
        
        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca, filename+"_PCA95", labels_per_window, file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
            
            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
Developer: BovineJoni, Project: StylometricClustering, Lines: 81, Source file: data.py

Example 9: PlaintextCorpusReader

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
NUM_WORDS = 20
wordlist = PlaintextCorpusReader('', 'ofk(_chap_[1234])?\.txt')

def clean_words(words):
    #convert everything to lower case
    words = [w.lower() for w in words]
    #remove period from end of sentences
    words =  [re.sub('\.','',w) for w in words]
    #only keep alphabetic strings
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if not w in stopwords.words('english')]
    #do stemming "goes" => "go"
    words = [nltk.PorterStemmer().stem(w) for w in words]
    return words

paras = [sum(para, []) for para in wordlist.paras('ofk.txt')]
words = clean_words(wordlist.words('ofk.txt'))
groups = []
for i in range(0, len(paras), GROUP_LENGTH):
    group = sum(paras[i : min(i + GROUP_LENGTH, len(paras))], [])
    groups.append(group)

freqs = []
for group in groups:
    freq = FreqDist(clean_words(group))
    table = {w:freq[w] for w in freq}
    freqs.append(table)

top_words = [w for w in FreqDist(words)][:NUM_WORDS]
def get_word_freqs(word):
    return {'word':word, 'values':[{'x':i, 'y':freqs[i].get(word, 0)} for i in range(len(freqs))]}
Developer: colegleason, Project: ofk, Lines: 33, Source file: gen_data.py

Example 10: len

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import wordnet as wn

# Grab stopwords.
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords) #127

# Read the plain text.
corpus_root = 'corpora'
aow = PlaintextCorpusReader(corpus_root, 'Art-of-War.txt')

aow.fileids() #['Art-of-War.txt']
aow.words() #['THE', 'ART', 'OF', 'WAR', 'BY', 'SUN', 'TZU', ...]
len(aow.words()) #13038
len(aow.sents()) #943
len(aow.paras()) #399
len([s for s in aow.sents() if 'enemy' in s]) #111
len([s for s in aow.sents() if 'enemies' in s]) #1
len([s for s in aow.sents() if 'ally' in s]) #2
len([s for s in aow.sents() if 'allies' in s]) #3
len([s for s in aow.sents() if 'spy' in s]) #8
len([s for s in aow.sents() if 'spies' in s]) #11

# Extract the list of sentences with /^enem(?:y|ies)$/i words.
enemy_sents = []
for s in aow.sents():
    for w in s:
        if w.lower().startswith('enem'):
            enemy_sents.append(s) # TODO Skip if seen.

len(enemy_sents) #126 XXX Can contain duplicates. Fix TODO above.
Developer: ology, Project: NLTK-Study, Lines: 33, Source file: art-of-war.py

Example 11: file

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or alternatively: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
'''def beatles_corpus.sents(self, fileids=None):
    """
    :return: the given file(s) as a list of
        sentences or utterances, each encoded as a list of word
        strings.
    :rtype: list(list(str))
    """
    if self._sent_tokenizer is None:
        raise ValueError('No sentence tokenizer for this corpus')

    return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
                 for (path, enc, fileid)
                 in self.abspaths(fileids, True, True)])
'''
k = 0
custom_corpus = nltk.Text(beatles_corpus.words())
for i in beatles_corpus.paras():
    # Iterate through songs, printing out contents
    # print "Song # " + str( k )
    k += 1
    l = 0
    while l < len(i):
        # print "Line " + str(l)
        # print i[l]
        l += 1
    # for j in i:
    # print j[0]

# new_song = custom_corpus.generate(100)
Developer: shawnadelic, Project: joob, Lines: 31, Source file: custom_corpus.py


Note: The nltk.corpus.PlaintextCorpusReader.paras method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Please consult the corresponding project's license before redistributing or reusing the code; do not reproduce without permission.