

Python PlaintextCorpusReader.raw Method Code Examples

This article collects typical usage examples of the Python method nltk.corpus.PlaintextCorpusReader.raw. If you are wondering what PlaintextCorpusReader.raw does or how to use it, the curated examples below should help. You can also explore the other usage examples of nltk.corpus.PlaintextCorpusReader.


The 15 code examples of PlaintextCorpusReader.raw shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
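
For orientation, here is a minimal sketch of the method itself, independent of the projects below. PlaintextCorpusReader scans a root directory for files matching a regular expression, and raw() returns file contents as a single string; the directory my_corpus/ and file doc1.txt are hypothetical placeholders.

from nltk.corpus import PlaintextCorpusReader

corpus = PlaintextCorpusReader('my_corpus/', r'.*\.txt')  # hypothetical directory of .txt files
print(corpus.fileids())          # fileids matched by the pattern
text = corpus.raw('doc1.txt')    # raw text of one file, as a single string
all_text = corpus.raw()          # concatenated raw text of every file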

Example 1: train_computer_science

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def train_computer_science(save):
    # Corpus directory shipped next to this module; also requires `import os`
    # plus the project-local Chunker and print_chunk_score helpers.
    comp_sci_corpus = PlaintextCorpusReader('{}/corpus/computerscience/'
                                            .format(os.path.dirname(os.path.abspath(__file__))), '.*')

    # Train on the raw training text, then score against the held-out test text.
    comp_sci_chunker = Chunker('computerscience', comp_sci_corpus.raw('train.txt'))
    chunk_score = comp_sci_chunker.evaluate(comp_sci_corpus.raw('test.txt'))

    print_chunk_score(chunk_score)

    if save:
        comp_sci_chunker.save_chunker()

Author: Mo-Talha, Project: Nomad, Lines: 13, Source: train.py

Example 2: tokenize_report_sents

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def tokenize_report_sents(self, report_of_the_time):
    env = ReportEnviroments()  # renamed from `re`, which shadowed the re module
    new_corpus_reports_fileids_list = PlaintextCorpusReader(env.original_reports_corpus_path, '.*')
    raw_text = new_corpus_reports_fileids_list.raw(report_of_the_time)
    sentencas_raw = sent_tokenize(raw_text)
    original_report_path = str(new_corpus_reports_fileids_list.abspath(report_of_the_time))
    return sentencas_raw, original_report_path, report_of_the_time

Author: EduardoCarvalho, Project: nltkSegmenter, Lines: 9, Source: reportSegmenter.py

Example 3: compare

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def compare(request):
    stats = []
    for x in range(1, 3):
        # One fresh statistics list per canto; the original reused a single
        # list across iterations and rendered it instead of `stats`.
        statistics = [x]
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)

    return render_to_response('compare.html', {'stats': stats})

Author: prashaantt, Project: savitri-labs, Lines: 31, Source: views.py

Example 4: stats

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})

Author: prashaantt, Project: savitri-labs, Lines: 32, Source: views.py

Example 5: extract_related_terms

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def extract_related_terms(self):
    env = ReportEnviroments()  # renamed from `re`, which shadowed the re module
    new_corpus_clusters_fileids_list = PlaintextCorpusReader(env.cluster_corpus_path, '.*')
    # One single-element list of raw text per file in the cluster corpus.
    raw_text_list = [[new_corpus_clusters_fileids_list.raw(fileids=fileid)]
                     for fileid in new_corpus_clusters_fileids_list.fileids()]
    return raw_text_list

Author: EduardoCarvalho, Project: nltkSegmenter, Lines: 9, Source: extractCluster.py
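
A side note on the NLTK API: raw() called with no arguments (or with an explicit list of fileids) returns the concatenated contents of the matched files, so the per-file loop above is only needed when a separate string per file is required. A one-line sketch against the same reader:

all_clusters_text = new_corpus_clusters_fileids_list.raw()  # whole corpus as one string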

Example 6: extractWordsOnly

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def extractWordsOnly(self, article):
    articlename = article + '.txt'
    #corpus_root = '/home/jesal/onedump/'
    wl = PlaintextCorpusReader(corpus_root, '.*')
    exturllist = self.extractexternalURL(article)
    textstring = wl.raw(articlename)
    # Blank out every external URL before tokenizing.
    for item in exturllist:
        textstring = textstring.replace(item, ' ')

    templist = nltk.word_tokenize(textstring)
    # Keep only the alphabetic characters of each token.
    listtemp = []
    for i in templist:
        j = re.sub('[^A-Za-z]+', '', i)
        listtemp.append(str(j))

    templistfinal = self.removeEmpty(listtemp)
    return templistfinal

Author: Wiki-G, Project: wikiG-app, Lines: 29, Source: testImplemented.py

Example 7: raw

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
	def raw(self, fileid):
		"""
			Returns raw text of fileid
			
			>>> hr.raw('1996/HAM2-960622.xml')[:38]
			'<?xml version="1.0" encoding="UTF-8"?>'
		"""		
		wordlists = PlaintextCorpusReader(self.hamshahri_root, fileid)
		return wordlists.raw(fileid)
Author: alifars, Project: hazm, Lines: 11, Source: HamshahriReader.py

Example 8: GetTweets

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def GetTweets():
    corpusdir = 'DB/'

    newCorpus = PlaintextCorpusReader(corpusdir, r'.*\.txt$')  # the .txt pattern skips files such as .DS_Store

    pattern = r'\r\n'  # each 'tweet' in the database ends with a \r\n line break
    tweets = nltk.regexp_tokenize(newCorpus.raw(), pattern, gaps=True)  # split the raw corpus into tweets
    tweets = [x.lower() for x in tweets]  # lowercase everything to make matching easier
    return tweets

Author: AndrewSB, Project: TwitterPMI, Lines: 11, Source: main.py

Example 9: raw

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def raw():
	"""
		Returns raw text of corpus
		
		>>> raw()[:54]
		'#                                                 DELM'
	"""		
	wordlists = PlaintextCorpusReader(bijankhan_root, bijankhan_fileid)
	return wordlists.raw(bijankhan_fileid)
Author: alifars, Project: hazm, Lines: 11, Source: BijankhanReader.py

Example 10: process_nan

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def process_nan():
    corpus_root = '../nan_samples/'
    library = PlaintextCorpusReader(corpus_root, '.*', encoding='utf-8')
    tokens = nltk.word_tokenize(library.raw())
    tokens = [process_element(x) for x in tokens]  # the original used Python 2 style map()
    nan_tokens = []
    for i in tokens:
        nan_tokens += i.split(' ')
    return nan_tokens

Author: cferko, Project: eldritch, Lines: 11, Source: saryn_3_backend.py

Example 11: small_event_sentences

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def small_event_sentences():
	corpus_root = '../Texas_Wild_Fire/'

	wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')

	SmallEvent = wordlists.raw()

	sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')

	SmallEventSentences = sent_tokenizer.tokenize(SmallEvent)
	return SmallEventSentences
Author: jplahn, Project: NLP-Capstone, Lines: 13, Source: SentenceTokenizer.py

Example 12: class_event_sentences

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def class_event_sentences():
	corpus_root = '../Islip13Rain/'

	wordlists = PlaintextCorpusReader(corpus_root, ".*\.txt")

	ClassEvent = wordlists.raw()

	sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')

	ClassEventSentences = sent_tokenizer.tokenize(ClassEvent)
	return ClassEventSentences
Author: jplahn, Project: NLP-Capstone, Lines: 13, Source: SentenceTokenizer.py

Example 13: big_event_sentences

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def big_event_sentences():
	corpus_root = '../Brazil_NightClub_Fire/'

	wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')

	BigEvent = wordlists.raw()

	sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')

	BigEventSentences = sent_tokenizer.tokenize(BigEvent)
	return BigEventSentences
Author: jplahn, Project: NLP-Capstone, Lines: 13, Source: SentenceTokenizer.py
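
Examples 11 through 13 all load the same pickled Punkt model from a relative path. Assuming the standard punkt data has been installed with nltk.download('punkt'), nltk's sent_tokenize loads the bundled English Punkt model itself, so the following sketch is an equivalent (but not the authors') formulation:

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(wordlists.raw())  # same Punkt-based sentence split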

Example 14: carga

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def carga():
    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs = db.SIMILITUD

    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    # Compare every file against every file (both orders, including itself)
    # and store one similarity document per pair.
    for fileid in newcorpus.fileids():
        for file2 in newcorpus.fileids():
            result = {"f1": fileid, "f2": file2,
                      "value": compare_texts(newcorpus.raw(fileid), newcorpus.raw(file2))}
            docs.insert_one(result)

Author: jorpramo, Project: TAKESI_LOADER, Lines: 14, Source: prueba.py
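
If compare_texts is symmetric (an assumption about this project, not stated in the excerpt), the double loop above computes every score twice and also compares each file with itself. A sketch of a deduplicated variant:

import itertools

# Each unordered pair once, no self-comparisons.
for fileid, file2 in itertools.combinations(newcorpus.fileids(), 2):
    result = {"f1": fileid, "f2": file2,
              "value": compare_texts(newcorpus.raw(fileid), newcorpus.raw(file2))}
    docs.insert_one(result)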

Example 15: extractexternalURL

# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Alternatively: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def extractexternalURL(self, article):
    #corpus_root = '/home/jesal/onedump/'
    wl = PlaintextCorpusReader(corpus_root, '.*')
    articlename = article + '.txt'
    rawopen = wl.raw(articlename)
    # The pattern was commented out in the original excerpt, but re.findall
    # below needs it; restoring the definition makes the function runnable.
    rg = re.compile('http..(?:\\/[\\w\\.\\-]+)+', re.IGNORECASE | re.DOTALL)
    listfinal = re.findall(rg, rawopen)
    return listfinal

Author: Wiki-G, Project: wikiG-app, Lines: 14, Source: testImplemented.py


Note: The nltk.corpus.PlaintextCorpusReader.raw examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their authors; copyright in the source code remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.