本文整理汇总了Python中gensim.corpora.WikiCorpus类的典型用法代码示例。如果您正苦于以下问题：Python corpora.WikiCorpus类的具体用法？Python corpora.WikiCorpus怎么用？Python corpora.WikiCorpus使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块gensim.corpora的用法示例。
在下文中一共展示了corpora.WikiCorpus类的8个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: to_text
# 需要导入模块: from gensim import corpora [as 别名]
# 或者: from gensim.corpora import WikiCorpus [as 别名]
def to_text():
    """Dump the finance-news titles and bodies to ``config['input_raw']``.

    Reads ``data/financenews/news.csv`` below the current working directory,
    takes the ``Title`` column followed by the ``NewsContent`` column, and
    writes every entry (stringified and stripped) on its own line. Progress
    is logged once per 10000 written lines.
    """
    # Disabled alternative, kept for reference: pull the text out of a raw
    # Wikipedia dump with WikiCorpus instead of reading the CSV file.
    # wiki_corpus = WikiCorpus(config['wiki_raw'], dictionary={})
    # texts_num = 0
    # with open(config['input_raw'], 'w', encoding='utf-8') as output:
    #     for text in wiki_corpus.get_texts():
    #         output.write(' '.join(text) + '\n')
    #         texts_num += 1
    #         if texts_num % 10000 == 0:
    #             logging.info("Parsed %d th articles" % texts_num)
    frame = pd.read_csv(os.getcwd() + '/data/financenews/news.csv')
    # All titles come first, then all article bodies, in one flat list.
    entries = list(frame['Title'].values) + list(frame['NewsContent'].values)
    with open(config['input_raw'], 'w', encoding='utf-8') as sink:
        for written, entry in enumerate(entries, start=1):
            # str() because the column values are not guaranteed to be strings.
            sink.write(str(entry).strip() + '\n')
            if written % 10000 == 0:
                logging.info("Parsed %d th articles" % written)
示例2: wiki_extract
# 需要导入模块: from gensim import corpora [as 别名]
# 或者: from gensim.corpora import WikiCorpus [as 别名]
def wiki_extract(input_file, output_file):
    """Extract the plain text of a downloaded Wikipedia dump.

    Every article yielded by ``WikiCorpus.get_texts()`` is written to
    ``output_file`` as one line of space-separated tokens.

    :param input_file: path of the raw dump file
    :param output_file: path of the text file to write
    :return: None
    :raises FileNotFoundError: if ``input_file`` does not exist
    """
    # Validate with an explicit raise: an ``assert`` is stripped when Python
    # runs with -O and would let a missing dump slip through.
    if not Path(input_file).resolve().exists():
        raise FileNotFoundError("wiki dump not found: %s" % input_file)
    # Create the output directory if needed, including missing ancestors.
    output_file_path = Path(output_file).resolve()
    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Start extract wiki ..")
    # NOTE(review): recent gensim releases appear to have dropped the
    # ``lemmatize`` argument -- confirm against the installed version.
    wiki = WikiCorpus(input_file, lemmatize=False)
    with open(output_file, "w", encoding="utf8") as f:
        # Count from 1 so the progress message reports articles actually
        # written (the 0-based index logged "Saved 0 articles" up front).
        for count, text in enumerate(wiki.get_texts(), start=1):
            f.write(" ".join(text) + "\n")
            if count % 10000 == 0:
                logger.info("Saved %d articles" % count)
    logger.info("Finished extract wiki, Saved in %s" % output_file)
示例3: main
# 需要导入模块: from gensim import corpora [as 别名]
# 或者: from gensim.corpora import WikiCorpus [as 别名]
def main():
    """Convert a Wikipedia dump into ``wiki_texts.txt``, one article per line.

    Expects exactly one command-line argument, the path of the dump file.
    Otherwise a usage message is printed and the process exits with status 1.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        # sys.exit rather than the ``exit`` helper injected by the site
        # module; the non-zero status marks the usage error for callers.
        sys.exit(1)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    with io.open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for texts_num, text in enumerate(wiki_corpus.get_texts(), start=1):
            # The original joined with b' ' and decoded, which only works for
            # bytes tokens; decode per token so str tokens are accepted too.
            words = [tok.decode('utf-8') if isinstance(tok, bytes) else tok for tok in text]
            output.write(' '.join(words) + '\n')
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num)
示例4: main
# 需要导入模块: from gensim import corpora [as 别名]
# 或者: from gensim.corpora import WikiCorpus [as 别名]
def main():
    """Write every article of a Wikipedia dump to ``wiki_texts.txt``.

    The dump path is taken from the single expected command-line argument.
    With any other argument count a usage message is printed and the
    process exits with status 1. Each article becomes one line of
    space-separated tokens.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        # ``exit`` is only a convenience injected by the site module; use
        # sys.exit and report the usage error with a non-zero status.
        sys.exit(1)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    with open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for texts_num, text in enumerate(wiki_corpus.get_texts(), start=1):
            output.write(' '.join(text) + '\n')
            # Progress report every 10000 articles.
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num)
示例5: zhwiki2chars
# 需要导入模块: from gensim import corpora [as 别名]
# 或者: from gensim.corpora import WikiCorpus [as 别名]
def zhwiki2chars(in_file, out_file):
    """Write a Wikipedia dump as lines of space-separated characters.

    Every article from ``WikiCorpus(in_file).get_texts()`` becomes one line
    of ``out_file``. Tokens consisting solely of ASCII letters are dropped;
    every other token is split into its individual characters.

    :param in_file: path of the Wikipedia dump
    :param out_file: path of the text file to write (UTF-8 encoded)
    """
    ascii_word = re.compile(r'^[a-zA-Z]+$')

    # ``with`` closes the file even if parsing fails midway; the explicit
    # encoding keeps non-ASCII output independent of the platform locale.
    with open(out_file, 'w', encoding='utf-8') as out:
        # NOTE(review): recent gensim releases appear to have dropped the
        # ``lemmatize`` argument -- confirm against the installed version.
        wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
        for i, article in enumerate(wiki.get_texts(), start=1):
            tokens = []
            for token in article:
                # Tokens were decoded unconditionally before, which only
                # works for bytes; accept already-decoded str tokens too.
                if isinstance(token, bytes):
                    token = token.decode("utf-8")
                token = token.strip()
                if ascii_word.match(token) is not None:
                    continue
                tokens.append(" ".join(token))  # divided by character
            out.write(" ".join(tokens) + "\n")
            if i % 10000 == 0:
                print("process %d articles" % i)
示例6: formatTime
# 需要导入模块: from gensim import corpora [as 别名]
# 或者: from gensim.corpora import WikiCorpus [as 别名]
def formatTime(seconds):
    """
    Render an elapsed duration, given in seconds, as an ``h:mm`` string.

    Leftover seconds below a full minute are discarded, not rounded.
    """
    whole_minutes = seconds // 60
    hours = whole_minutes // 60
    minutes = whole_minutes % 60
    return "%d:%02d" % (hours, minutes)
# ======== main ========
# Main entry point for the script.
# This little check has to do with the multiprocessing module (which is used by
# WikiCorpus). Without it, the code will spawn infinite processes and hang!
示例7: make_corpus
# 需要导入模块: from gensim import corpora [as 别名]
# 或者: from gensim.corpora import WikiCorpus [as 别名]
def make_corpus(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus.

    Each article yielded by ``WikiCorpus.get_texts()`` is written to
    ``out_f`` (UTF-8) as one line of space-separated tokens. Progress is
    printed every 10000 articles and a final message once the file is closed.

    :param in_f: path of the Wikipedia dump
    :param out_f: path of the text file to write
    """
    # ``with`` guarantees the output file is closed even when parsing fails.
    with open(out_f, 'w', encoding="utf-8") as output:
        wiki = WikiCorpus(in_f, tokenizer_func=tokenize, dictionary=Dictionary())
        for i, text in enumerate(wiki.get_texts(), start=1):
            # The joined tokens are already a str; the former encode/decode
            # round-trip through UTF-8 added nothing and was dropped.
            output.write(' '.join(text) + '\n')
            if i % 10000 == 0:
                print('Processed ' + str(i) + ' articles')
    print('Processing complete!')
示例8: extract_articles_wiki
# 需要导入模块: from gensim import corpora [as 别名]
# 或者: from gensim.corpora import WikiCorpus [as 别名]
def extract_articles_wiki(wiki_raw_path, raw_documents_path, **_):
    """Write every article of a Wikipedia dump as one line of tokens.

    :param wiki_raw_path: path of the raw Wikipedia dump
    :param raw_documents_path: path of the text file to write (UTF-8)
    :param _: extra keyword arguments are accepted and ignored
    """
    # NOTE(review): recent gensim releases appear to have dropped the
    # ``lemmatize`` argument -- confirm against the installed version.
    wiki_corpus = WikiCorpus(wiki_raw_path, lemmatize=False, dictionary={}, tokenizer_func=tokenize, lower=False)
    # Explicit encoding: the locale default cannot be relied on to encode
    # arbitrary Wikipedia text on every platform.
    with open(raw_documents_path, 'w', encoding='utf-8') as raw_documents_file:
        for text in tqdm(wiki_corpus.get_texts()):
            raw_documents_file.write(' '.join(text) + '\n')