

Python PlaintextCorpusReader.sents Method Code Examples

This article collects typical usage examples of the Python method nltk.corpus.PlaintextCorpusReader.sents. If you are unsure what PlaintextCorpusReader.sents does, how to call it, or want to see it used in context, the curated examples below may help. You can also browse further usage examples of the containing class, nltk.corpus.PlaintextCorpusReader.


The following shows 15 code examples of the PlaintextCorpusReader.sents method, sorted by popularity by default.
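Before the examples, here is a minimal, self-contained sketch of the method's basic use (the corpus directory and file pattern below are illustrative assumptions):

from nltk.corpus import PlaintextCorpusReader

# Hypothetical directory containing plain-text files.
corpus_root = './my_corpus'
reader = PlaintextCorpusReader(corpus_root, r'.*\.txt')

# sents() returns the corpus as a list of sentences,
# each sentence being a list of word tokens.
for sentence in reader.sents()[:3]:
    print(' '.join(sentence))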

Example 1: main

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def main():
	current_directory = os.path.dirname(__file__)
	corpus_root = os.path.abspath(current_directory)
	wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
	wordlists.fileids()
	ClassEvent = nltk.Text(wordlists.words())
	CEWords = ["Long Island", "Weather Service", "flooding", "August", 
		"heavy rains", "Wednesday", "Suffolk County", "New York", "rainfall",
		"record"]

	# ClassEvent Statistics
	print "--------- CLASS EVENT STATISTICS -------------"
	print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)	
	print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(ClassEvent)
	print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(ClassEvent, CEWords)
	
	ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
	# Float division so the words-per-sentence ratio is not truncated (Python 2).
	ClassEventWordsPerSent = float(len(wordlists.words())) / len(wordlists.sents())
	ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
		ClassEventWordsPerSent) - 21.43
	
	print "Average number of letters per word", ClassEventLettersPerWord
	print "Average number of words per sentence:", ClassEventWordsPerSent
	print "Automated Readability Index:", ClassEventARI


	print 

	wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
	wordlists_event.fileids()
	YourSmall = nltk.Text(wordlists_event.words())
	SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn", 
		"Town", "Heat", "Wind", "Speed", "Size", "City", "People", "Home",
		"Weather", "Debris", "Death", "Smoke", "State", "Ash"]
	

	# YourSmall statistics
	print "--------- YOUR SMALL STATISTICS --------------"
	print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
	print "YourSmall WORD LENGTH DISTRIBUTIONS:"
	print_word_length_distributions(YourSmall)
	print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
	print_percentage_of_word_in_collection(YourSmall, SmallEventWords)
	
	YourSmallLettersPerWord = average_letters_per_word(YourSmall)
	# Float division again to avoid Python 2 integer truncation.
	YourSmallWordsPerSent = float(len(wordlists_event.words())) / \
		len(wordlists_event.sents())
	YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
		YourSmallWordsPerSent) - 21.43

	print "Average number of letters per word", YourSmallLettersPerWord
	print "Average number of words per sentence:", YourSmallWordsPerSent
	print "Automated Readability Index", YourSmallARI
Author: jplahn, Project: NLP-Capstone, Lines: 57, Source: Statistics.py
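The readability score printed above is the Automated Readability Index, ARI = 4.71 * (letters per word) + 0.5 * (words per sentence) - 21.43. A standalone sketch of the same computation (this helper is not part of the original project; reader is any PlaintextCorpusReader):

def automated_readability_index(reader):
    # Letter, word, and sentence counts over the whole corpus.
    words = reader.words()
    n_letters = sum(len(w) for w in words)
    n_words = len(words)
    n_sents = len(reader.sents())
    return (4.71 * n_letters / float(n_words)
            + 0.5 * n_words / float(n_sents)
            - 21.43)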

Example 2: textinfo

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def textinfo(path):
    """
    Takes a directory path and prints figures about the text files contained therein.
    """

    from nltk.corpus import PlaintextCorpusReader

    # The original referenced an undefined name `text`; the argument is `path`.
    corpusReader = PlaintextCorpusReader(path, '.*')

    word_count = sum(len(sentence) for sentence in corpusReader.sents())
    sent_count = len(corpusReader.sents())
    print "Total word count:", word_count
    print "Unique words:", len(set(corpusReader.words()))
    print "Sentences:", sent_count
    print "Average sentence length in words:", word_count / float(sent_count)
Author: cmstewart, Project: galv, Lines: 15, Source: q1.py
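A hedged usage sketch for the function above (the directory name is an assumption):

textinfo('./texts')  # prints totals for every plain-text file under ./texts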

Example 3: get_coarse_level_features

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def get_coarse_level_features(dataset, output_file):
	# Import the corpus reader
	corpus_root = '/home1/c/cis530/data-hw2/'+dataset
	# Define the folder where the files are situated
	files_dataset = PlaintextCorpusReader(corpus_root, '.*')
	# Open the output_file
	output = open('/home1/c/cis530/data-hw2/'+output_file,'w')
	# Read the stop word list (stopwlist.txt)
	stop_list = open('/home1/c/cis530/data-hw2/'+'stopwlist.txt').read()
	types_stop_list=stop_list.split()
	for fileid in files_dataset.fileids():
		# Output the docid
		output.write(dataset+'/'+fileid+' ')
		# Output the topic_name
		topic_name=fileid.split('/')[0]	
		output.write(topic_name+' ')
		# Output the num_tokens	
		tokens=files_dataset.words(fileid)
		output.write('tok:'+str(len(tokens))+' ')
		# Output the num_types
		types=set(tokens)
		output.write('typ:'+str(len(types))+' ')
		# Output the num_contents
		output.write('con:'+str(len([w for w in tokens if w not in types_stop_list]))+' ')
		# Output the num_sents
		sents = files_dataset.sents(fileid)
		output.write('sen:'+str(len(sents))+' ')
		# Output the avg_slen
		avg_slen=round(float(len(tokens))/float(len(sents)),2)
		output.write('len:'+str(avg_slen)+' ')
		# Output the num_caps
		output.write('cap:'+str(len([w for w in tokens if w[0]>='A' and w[0]<='Z'])))
		output.write('\n')
	output.close()
Author: gabhi, Project: new-york-times-summarization, Lines: 36, Source: topic-classification.py
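A hedged usage sketch (the dataset folder and output file name are assumptions; note the function reads and writes under the hard-coded /home1/c/cis530/data-hw2/ root):

get_coarse_level_features('Train_set', 'train_features.txt')
# Writes one line per file: docid, topic, then tok:/typ:/con:/sen:/len:/cap: counts.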

Example 4: compare

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def compare(request):
    errors = []
    stats = []
    for x in range(1, 3):
        # Reset per canto; previously this list accumulated across iterations.
        statistics = []
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        # Float division: these ratios were silently truncated under Python 2.
        lexical_density = float(l_words) / l_uwords
        l_wordpersent = float(l_words) / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)

    return render_to_response('compare.html', {'stats': stats})
Author: prashaantt, Project: savitri-labs, Lines: 31, Source: views.py

Example 5: stats

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            # Float division: these ratios were silently truncated under Python 2.
            lexical_density = float(l_words) / l_uwords
            l_wordpersent = float(l_words) / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
Author: prashaantt, Project: savitri-labs, Lines: 32, Source: views.py

Example 6: main

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def main():
    corpus_root = '../posts/'
    newcorpus = PlaintextCorpusReader(corpus_root, '.*',
                                      para_block_reader=read_block_no_metadata)
    corpus_words = [w.lower() for w in newcorpus.words() if w.isalpha()]
    corpus_sentences = newcorpus.sents()
    analyst = TextAnalyst(corpus_words, corpus_sentences, 'french')
    analyst.print_analyze()
Author: Raveline, Project: journal-imaginaire, Lines: 10, Source: analyst.py

Example 7: extractPossibleTerms

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def extractPossibleTerms(root, fileids):
    # get corpus
    #root, filename = os.path.split(path)
    reader = PlaintextCorpusReader(root, fileids)
    # get chunker
    grammar = 'NP: {<JJ>*<NNP>*<NN>*}'
    chunker = RegexpParser(grammar)
    # get terms
    terms = set()
    print len(reader.sents())
    i = 0
    for sent in reader.sents():
        i += 1
        if i%100==0:
            print i
        tree = chunker.parse(pos_tag(sent))
        for t in tree.subtrees(lambda t: t.node!='S'): # exclude Sentence node (NLTK 2 API; use t.label() in NLTK 3)
            terms.add(' '.join([el[0] for el in t]))
    return terms
Author: AdamMeyers, Project: The_Termolator, Lines: 21, Source: chunker.py
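A hedged usage sketch (the directory and pattern are assumptions; pos_tag and RegexpParser are imported from nltk at module level in the original):

terms = extractPossibleTerms('./docs', r'.*\.txt')
print(sorted(terms)[:10])  # a sample of the candidate noun-phrase terms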

Example 8: get_coarse_level_features

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def get_coarse_level_features(dataset, output_file):
# accessing the corpus
    corpus_root = '/home1/c/cis530/data-hw2/' 
    dataset_path = corpus_root + dataset

# Reading the files from the directories
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
    stops = stopFile.words()

#Opening a file that has to be written to
    out = open(output_file, 'w')

    for i in range(len(ids)):  # the original range(0, len(ids) - 1) skipped the last file
#Initializing certain variables
        tokens_count=0
        types = 0
        non_stops_count=0
        sents_count = 0
        avg_sent_len=0
        cap_count = 0

        tokens=files.words(ids[i])
#Computing Number of Tokens
        tokens_count = len(tokens)

#Computing Number of types
        types = len(set(tokens))
        non_stops=[]

#Computing Number of Content Words
        for t in tokens:
            if t not in stops:
                non_stops.append(t)
        non_stops_count = len(non_stops)

#Finding Average Sentence Length
        sent = []
        sent = files.sents(ids[i])
        sents_count = len(sent)
        sent_len=0
        for s in sent:
            sent_len = sent_len + len(s)
        avg_sent_len = sent_len/float(sents_count)

#Computing Number of Captilized Words
        for c in non_stops:
            if c.istitle():
                cap_count = cap_count+1
        current_file = dataset + '/' + ids[i]
        e = current_file.split('/')
        out.write(current_file +' '+ e[-2] + ' tok:' + str(tokens_count) + ' typ:' + \
str(types) + ' con:' + str(non_stops_count) + ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) + ' cap:' + str(cap_count)+ '\n')
        out.flush()
Author: madhuraraju, Project: NLP_Class_Code_Samples, Lines: 57, Source: CL_Two_Code_rmadhura.py

Example 9: train

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def train():

   wordlists = PlaintextCorpusReader('', file_path)

   st = stemmer()
   
   # Get blocks of text using NLTK
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # LOGIC
   #       If a sentence contains a known [posi/nega]tive word, count the instances of words in that sentence as 
   #       [posi/nega]tive

   # Count words
   word_features = []

   # Go through paragraphs
   for p in paras:

      # Score the paragraph: +1 for each positive word, -1 for each negative word
      score_positive_negative = 0
      for s in p:
         for word in s:

            word = st.stem(word)

            if word in words_positive:
               score_positive_negative += 1
            elif word in words_negative:
               score_positive_negative -= 1
   
      # Record class of paragraph for any words present
      for s in p:
         for word in s:

            word = st.stem(word)

            if score_positive_negative > 0:
               word_features.append( ({"word": word}, "+") )
            elif score_positive_negative < 0:
               word_features.append( ({"word": word}, "-") )
            else:
               word_features.append( ({"word": word}, " ") )

   # Create and return classifier
   classifier = nltk.NaiveBayesClassifier.train(word_features)
   return classifier
Author: ace-n, Project: cs398vl-mp3, Lines: 51, Source: mp3-paragraph-classifier.py
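A hedged usage sketch (file_path, stemmer, words_positive, and words_negative are module-level names the original script is assumed to define):

classifier = train()
print(classifier.classify({"word": "excellent"}))  # one of "+", "-", or " "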

Example 10: main

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def main():

   st = stemmer()

   # Get data
   wordlists = PlaintextCorpusReader('', file_path)
   words = wordlists.words(file_path)
   sents = wordlists.sents(file_path)
   paras = wordlists.paras(file_path)

   # Train
   classifier = train()

   # Get class probabilities (for MAP estimation)
   counts = {"P":0, "-":0, "N":0}
   for i in range(0,len(paras)):
      for s in paras[i]:

         score_pos = 0
         score_neg = 0

         # Classify paragraph
         for word in s:

            word = st.stem(word)

            feature = {"word":word}
            classified = classifier.classify(feature)

            if classified == "+":
               score_pos += 1
            elif classified == "-":
               score_neg += 1

         # Record result
         if score_pos > score_neg:
            counts["P"] += 1
         elif score_pos < score_neg:
            counts["N"] += 1
         else:
            counts["-"] += 1

   # Done!
   print counts
Author: ace-n, Project: cs398vl-mp3, Lines: 46, Source: mp3-paragraph-classifier.py

Example 11: classifyByYear

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
    def classifyByYear(self) :
        corpusReader = PlaintextCorpusReader(self.txtDirectory, ".*.txt", encoding = self.codec)

        for journal in corpusReader.fileids() :
            print ("Start " + journal)

            sentList = corpusReader.sents(journal)

            for sent in sentList :
                getMonth = False
                getDOI = False

                line = ''.join(sent)

                if self.doiURLTypes[0] in line :
                    getDOI = True
                    self._extractYearByDOI(self.doiURLTypes[0], journal, line)
                    break
                elif self.doiURLTypes[1] in line :
                    getDOI = True
                    self._extractYearByDOI(self.doiURLTypes[1], journal, line)
                    break

                for word in sent :
                    if getMonth :
                        self._extractYearByMonth(journal, word)
                        break

                    if word.lower() in self.dictMonth :
                        getMonth = True

                if getMonth :
                    getMonth = False
                    break
                elif getDOI :
                    getDOI = False
                    break

            print ("End " + journal)

        print (str(self.yearDirectoryList))
Author: Ewha-Bio, Project: Genomics-Informatics-Corpus, Lines: 43, Source: gi_datautil.py

Example 12: get_sentences_for_text

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def get_sentences_for_text(corpus_root, filename, lang="english"):
    """Segments the given text into sentences.

    Args:
      corpus_root: Directory in which the text file is residing.
      filename: Name of the text file.
      lang: Tokenizer language. For possible values, look at:
        ${NLTK_DATA}/tokenizers/punkt

    Returns:
      Sentences in the given text.
    """
    tokenizer_path = "tokenizers/punkt/" + lang + ".pickle"
    text = PlaintextCorpusReader(
        corpus_root,
        [filename],
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader(tokenizer_path),
    )
    return text.sents()
Author: ufal, Project: wiki-error-corpus, Lines: 23, Source: utils.py
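A hedged usage sketch (the directory and file name are assumptions):

sentences = get_sentences_for_text('./data', 'article.txt', lang='english')
print(sentences[0])  # the first sentence, as a list of whitespace-delimited tokens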

Example 13: network

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def network(chapter):
	if(chapter == 0):
		NEs = open("finalNEs/finalNEs.txt").read().split('\n')
		text_raw = open("ofk.txt").read()
	else:
		NEs = open("finalNEs/finalNEs_ch" + str(chapter) + ".txt").read().split('\n')
		text_raw = open("ofk_ch" + str(chapter) + ".txt").read()
	result = [dict(name="", relations=[""])]
	for NE in NEs:
		result.append(dict(name=NE, relations=[""]))

	# Trim the placeholder element added at the start and the empty entry at the
	# end (an artifact of trailing newlines in finalNEs.txt).
	result = result[1:len(result)-1]
	corpus = PlaintextCorpusReader('.', 'ofk\.txt')
	sentences = corpus.sents()
	for x in range(len(sentences)):
		for NEdict in result:
			if NEdict["name"] in sentences[x]:
				# We are in a sentence with a named entity
				for n in result:
					if n["name"] in sentences[x] and n["name"] != NEdict["name"]:
						NEdict["relations"].append(n["name"])
	for NEdict in result:
		NEdict["relations"] = Set(NEdict["relations"][1:])
	final = [dict(name=r["name"], imports=list(r["relations"]), url=r["name"]+".html") for r in result]
	for finals in final:
		with open("../webpage/" + finals["name"] + ".html", "w") as f1:
			with open("part1.html") as f:
				for line in f:
					f1.write(line)
				f1.write(finals["name"])
			with open("part2.html") as f:
				for line in f:
					f1.write(line)
				f1.write("\tmain(\"data/" + finals["name"] + ".json" + "\");\n</script>")

	with open("../webpage/data/edgeBundle.json",'w') as outfile:
		json.dump(final,outfile, sort_keys = True, indent = 4, ensure_ascii=False)
Author: alv53, Project: Alvin_CS398VL_MP3, Lines: 40, Source: edgeBundle.py

Example 14: build_graph

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def build_graph(folder, file_pattern):
    corpus_root = os.getcwd() + "/" + folder
    print "Opening corpus " + folder + " ..."
    word_lists = PlaintextCorpusReader(corpus_root, file_pattern)

    naskah = word_lists.sents()
    filelists = word_lists.fileids()
    teks = tokenize.sent_tokenize(word_lists.raw(fileids=filelists))

    print folder + " has " + str(len(teks)) + " raw and " + str(len(naskah)) + " tokenized sentences."

    G_result = nx.Graph()
    print "Building graph for " + folder + " ..."
    # Add an edge between each pair of adjacent alphabetic tokens in a sentence.
    for kalimat in naskah:
        kata = kalimat[0]
        prevToken = kata.lower()
        for idx in range(1, len(kalimat)):
            kata = kalimat[idx]
            token = kata.lower()
            if containsLetter(token) and containsLetter(prevToken):
                G_result.add_edge(prevToken, token)
                prevToken = token

    return G_result
Author: barliant, Project: krextown, Lines: 26, Source: graftempo.py
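A hedged usage sketch (the folder name and file pattern are assumptions; nx is networkx, imported at module level in the original):

G = build_graph('corpus_folder', r'.*\.txt')
print('nodes: %d, edges: %d' % (G.number_of_nodes(), G.number_of_edges()))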

Example 15: PlaintextCorpusReader

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/home/vivkul/Downloads/project'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
# wordlists.fileids()
# wordlists.words('questions.txt')
amrit=wordlists.words('allquestion.txt')
stopwords = nltk.corpus.stopwords.words('english')
from nltk import FreqDist  # the original star-import from nltk.book only supplied FreqDist
fo=open("selectedquestion.txt","wb")
a=wordlists.sents('allquestion.txt')
while(len(amrit)!=0):
	content=[w for w in amrit if w.lower() not in stopwords]
	voc=FreqDist(content)
	# sorted([w for w in set(content) if len(w) > 2 and voc[w] > 3])
	# set_voc_0=FreqDist(a[0])
	# set_voc_1=FreqDist(a[1])
	b=voc.keys()
	i=0
	while(i<len(b)):
		if(len(b[i])>2):
			j=i
			max=b[i]
			break
		i=i+1
	q_no=[]
	k=0
	while(k<len(a)):
		set_voc=FreqDist(a[k])
		if(set_voc[max]>0):
			q_no.append(len([w for w in a[k] if w.lower() not in stopwords]))
Author: triveni692, Project: hacku, Lines: 33, Source: extract.py


Note: The nltk.corpus.PlaintextCorpusReader.sents examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective developers; copyright in the source code remains with the original authors, and distribution or use should follow the corresponding project's License. Do not reproduce without permission.