Python corpora.WikiCorpus方法代碼示例

本文整理匯總了Python中gensim.corpora.WikiCorpus方法的典型用法代碼示例。如果您正苦於以下問題：Python corpora.WikiCorpus方法的具體用法？Python corpora.WikiCorpus怎麼用？Python corpora.WikiCorpus使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類gensim.corpora的用法示例。

在下文中一共展示了corpora.WikiCorpus方法的8個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: to_text

# 需要導入模塊: from gensim import corpora [as 別名]
# 或者: from gensim.corpora import WikiCorpus [as 別名]
def to_text():
    """Dump news titles and bodies to the raw-text file, one entry per line.

    Reads ``data/financenews/news.csv`` under the current working directory,
    takes the ``Title`` column followed by the ``NewsContent`` column, and
    writes every value (stringified and stripped) to ``config['input_raw']``.
    Progress is logged every 10000 entries.
    """
    news = pd.read_csv(os.getcwd() + '/data/financenews/news.csv')
    # All titles come first, then all article bodies.
    raw_text = list(news['Title'].values) + list(news['NewsContent'].values)

    with open(config['input_raw'], 'w', encoding='utf-8') as output:
        for texts_num, entry in enumerate(raw_text, start=1):
            # Values are passed through str() before stripping, so
            # non-string cells are written as their string form.
            output.write(str(entry).strip() + '\n')
            if texts_num % 10000 == 0:
                logging.info("Parsed %d th articles" % texts_num)

開發者ID:zhengwsh，項目名稱:text-classification，代碼行數:25，代碼來源:train_word2vec.py

示例2: wiki_extract

# 需要導入模塊: from gensim import corpora [as 別名]
# 或者: from gensim.corpora import WikiCorpus [as 別名]
def wiki_extract(input_file, output_file):
    """Extract the text content of a downloaded wiki dump.

    Every article yielded by ``WikiCorpus.get_texts()`` is written to
    ``output_file`` as one line of space-separated tokens.

    :param input_file: path of the raw dump file
    :param output_file: path of the extracted text file
    :return: None
    """
    # The raw dump must already exist.
    assert Path(input_file).resolve().exists()
    # Create the output directory if it is missing; ``parents=True`` also
    # creates missing ancestors instead of failing on a nested new path.
    output_file_path = Path(output_file).resolve()
    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Start extract wiki ..")
    wiki = WikiCorpus(input_file, lemmatize=False)
    with open(output_file, "w", encoding="utf8") as f:
        # Count from 1 so the progress message reports the number of
        # articles actually written (no "Saved 0 articles" after the first).
        for saved, text in enumerate(wiki.get_texts(), start=1):
            f.write(" ".join(text) + "\n")
            if saved % 10000 == 0:
                logger.info("Saved %d articles", saved)
    logger.info("Finished extract wiki, Saved in %s", output_file)

開發者ID:EvilPsyCHo，項目名稱:TaskBot，代碼行數:22，代碼來源:wiki_extract.py

示例3: main

# 需要導入模塊: from gensim import corpora [as 別名]
# 或者: from gensim.corpora import WikiCorpus [as 別名]
def main():
    """Dump every article of a wiki data file to ``wiki_texts.txt``.

    Expects exactly one command-line argument, the path of the wiki data.
    Each article is written as one line of space-separated tokens, and
    progress is logged every 10000 articles.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        # Use sys.exit rather than the site-provided ``exit`` helper, and
        # return a non-zero status so callers can detect the usage error.
        sys.exit(1)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})

    with io.open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for texts_num, text in enumerate(wiki_corpus.get_texts(), start=1):
            # NOTE(review): joining with b' ' assumes get_texts() yields
            # bytes tokens -- confirm against the installed gensim version.
            output.write(b' '.join(text).decode('utf-8') + '\n')
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章", texts_num)

開發者ID:zake7749，項目名稱:word2vec-tutorial，代碼行數:18，代碼來源:wiki_to_txt.py

示例4: main

# 需要導入模塊: from gensim import corpora [as 別名]
# 或者: from gensim.corpora import WikiCorpus [as 別名]
def main():
    """Dump every article of a wiki data file to ``wiki_texts.txt``.

    Expects exactly one command-line argument, the path of the wiki data.
    Each article is written as one line of space-separated tokens, and
    progress is logged every 10000 articles.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        # Use sys.exit rather than the site-provided ``exit`` helper, and
        # return a non-zero status so callers can detect the usage error.
        sys.exit(1)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})

    with open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for texts_num, text in enumerate(wiki_corpus.get_texts(), start=1):
            output.write(' '.join(text) + '\n')
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章", texts_num)

開發者ID:zake7749，項目名稱:word2vec-tutorial，代碼行數:18，代碼來源:wiki_to_txt.py

示例5: zhwiki2chars

# 需要導入模塊: from gensim import corpora [as 別名]
# 或者: from gensim.corpora import WikiCorpus [as 別名]
def zhwiki2chars(in_file, out_file):
    """Write a wiki dump as lines of space-separated characters.

    For every article from ``WikiCorpus.get_texts()``, tokens made up only
    of ASCII letters are dropped; each remaining token is split into its
    individual characters, and the article is written to ``out_file`` as a
    single space-separated line. Progress is printed every 10000 articles.

    :param in_file: path of the wiki dump handed to ``WikiCorpus``
    :param out_file: path of the text file to write
    """
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        return reg.match(string) is not None

    # The context manager closes the file even if iteration or decoding
    # raises part-way through the dump.
    # NOTE(review): no explicit encoding is given, so the platform default
    # is used for the output file -- confirm that is intended.
    with open(out_file, 'w') as out:
        wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
        for i, article in enumerate(wiki.get_texts(), start=1):
            tokens = []
            for token in article:
                token = token.decode("utf-8").strip()
                if _isalpha(token):
                    continue
                tokens.append(" ".join(token))  # divided by character
            out.write(" ".join(tokens) + "\n")
            if i % 10000 == 0:
                print("process %d articles" % i)

開發者ID:chantera，項目名稱:blstm-cws，代碼行數:23，代碼來源:preprocess.py

示例6: formatTime

# 需要導入模塊: from gensim import corpora [as 別名]
# 或者: from gensim.corpora import WikiCorpus [as 別名]
def formatTime(seconds):
    """
    Render an elapsed duration, given in seconds, as an ``h:mm`` string.

    Leftover seconds are discarded; minutes are zero-padded to two digits.
    """
    whole_minutes = seconds // 60
    hours = whole_minutes // 60
    minutes = whole_minutes % 60
    return "%d:%02d" % (hours, minutes)
 

# ======== main ========
# Main entry point for the script.
# This little check has to do with the multiprocessing module (which is used by
# WikiCorpus). Without it, the code will spawn infinite processes and hang!

開發者ID:chrisjmccormick，項目名稱:wiki-sim-search，代碼行數:15，代碼來源:make_wikicorpus.py

示例7: make_corpus

# 需要導入模塊: from gensim import corpora [as 別名]
# 或者: from gensim.corpora import WikiCorpus [as 別名]
def make_corpus(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus.

    Each article from ``WikiCorpus.get_texts()`` is written to ``out_f`` as
    one UTF-8 line of space-separated tokens. Progress is printed every
    10000 articles and a completion message once the file is closed.

    :param in_f: path of the Wikipedia dump handed to ``WikiCorpus``
    :param out_f: path of the text file to write
    """
    # The context manager closes the file even if the dump iteration raises.
    with open(out_f, 'w', encoding="utf-8") as output:
        wiki = WikiCorpus(in_f, tokenizer_func=tokenize, dictionary=Dictionary())
        for i, text in enumerate(wiki.get_texts(), start=1):
            # The file already encodes to UTF-8, so no manual
            # encode/decode round-trip is needed here.
            output.write(' '.join(text) + '\n')
            if i % 10000 == 0:
                print('Processed ' + str(i) + ' articles')
    print('Processing complete!')

開發者ID:ratsgo，項目名稱:embedding，代碼行數:14，代碼來源:dump.py

示例8: extract_articles_wiki

# 需要導入模塊: from gensim import corpora [as 別名]
# 或者: from gensim.corpora import WikiCorpus [as 別名]
def extract_articles_wiki(wiki_raw_path, raw_documents_path, **_):
    """Write every article of the wiki dump to a text file, one per line.

    Articles come from ``WikiCorpus.get_texts()`` wrapped in ``tqdm``; the
    tokens of each article are joined with single spaces. Extra keyword
    arguments are accepted and ignored.
    """
    corpus = WikiCorpus(
        wiki_raw_path,
        lemmatize=False,
        dictionary={},
        tokenizer_func=tokenize,
        lower=False,
    )

    with open(raw_documents_path, 'w') as sink:
        articles = tqdm(corpus.get_texts())
        sink.writelines(' '.join(tokens) + '\n' for tokens in articles)

開發者ID:dreamgonfly，項目名稱:BERT-pytorch，代碼行數:9，代碼來源:preprocess.py

注：本文中的gensim.corpora.WikiCorpus方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。