Google ColaboratoryでWordCloudを使ってみる • barorin&?

はじめに

Google ColaboratoryでWordCloudを使う方法です。

方法

色々インストール

# リストから昇順で取り出すライブラリ
!pip install natsort

# MeCabインストール
!apt install aptitude swig
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3==0.996.3

# mecab-ipadic-NEologdインストール
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -a

# 日本語フォントインストール
!apt install -y fonts-ipafont-gothic

ライブラリのインポート等

import glob
import os
import subprocess

import MeCab
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from natsort import natsorted

# MeCab configuration: build the path of the mecab-ipadic-NEologd dictionary
# directory. A shell is required for the backtick substitution of
# `mecab-config --dicdir`; the command is a fixed string, so no untrusted
# input reaches the shell.
cmd = 'echo `mecab-config --dicdir`"/mecab-ipadic-neologd"'
# echo ends its output with a newline; strip it so `path` is a clean path.
path = subprocess.run(
    cmd, shell=True, stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()

# # To mount Google Drive, uncomment the following:
# from google.colab import drive
# drive.mount('/content/drive')

WordCloud を作成する関数

def make_wc(file_path, stopwords=None):
    '''Create a word-cloud PNG from a text file.

    The text is tokenized with MeCab, only nouns, verbs, adjectives and
    adverbs are kept, and the resulting cloud is shown with matplotlib and
    saved as ``./<file name without extension>_wordcloud.png``.

    Args:
        file_path (str): Path of the UTF-8 text file to read.
        stopwords (set): Set of stop words, passed through to WordCloud.
    Returns:
        None
    '''
    # File name without directory and extension; used for the output name
    basename_without_ext = os.path.splitext(os.path.basename(file_path))[0]
    print(basename_without_ext)

    # Read the whole text file
    with open(file_path, mode='rt', encoding='utf-8') as f:
        source_text = f.read()

    # Prepare MeCab.
    # NOTE(review): Tagger() is created without arguments, so the NEologd
    # dictionary path computed at module level (`path`) is not used here;
    # confirm whether `-d <path>` was intended.
    tagger = MeCab.Tagger()
    # Parse an empty string before parseToNode; presumably the usual
    # mecab-python3 workaround that keeps node.surface readable. Keep it.
    tagger.parse('')
    node = tagger.parseToNode(source_text)

    # Keep nouns, verbs, adjectives and adverbs. The part of speech is the
    # first comma-separated field of the node's feature string.
    target_types = ('名詞', '動詞', '形容詞', '副詞')
    word_list = []
    while node:
        word_type = node.feature.split(',')[0]
        if word_type in target_types:
            word_list.append(node.surface)
        node = node.next

    # WordCloud expects space-separated words
    word_chain = ' '.join(word_list)

    # Build the word cloud
    W = WordCloud(
        width=900,
        height=600,
        background_color='white',
        font_path='/usr/share/fonts/truetype/fonts-japanese-gothic.ttf',
        stopwords=stopwords
    ).generate(word_chain)

    # Display the image without axes
    plt.imshow(W)
    plt.axis('off')
    plt.show()

    # Save next to the notebook, named after the input file
    W.to_file(f'./{basename_without_ext}_wordcloud.png')

使ってみる！

# Load the stop words (when using a stop-word file).
# The content is split on whitespace so that each word becomes one set
# element; calling set() on the raw string would yield single characters.
# Assumes the file lists words separated by newlines or spaces.
with open('./stopwords.txt', mode='rt', encoding='utf-8') as f:
    stopwords = set(f.read().split())

# Create a word cloud for every .txt file in the current directory, in
# natural sort order. The stop-word list itself is not an input document,
# so it is skipped.
files = [
    txt for txt in glob.glob('./*.txt')
    if os.path.basename(txt) != 'stopwords.txt'
]
for file in natsorted(files):
    make_wc(file, stopwords)