自然言語処理の前処理メモ • barorin&?

はじめに

自然言語処理の前処理で役立つメモです。
入力となる df は、次の表のように id・title・text の 3 列を持つイメージです。

方法

id	title	text
0	hoge	I am so happy.
1	fuga	NaN
2	test	Hello World!!
3	yo	He is 13 years old.

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd

# Load the input table; header=0 takes the column names from the first row.
df = pd.read_csv('df.csv', header=0)

# Treat a missing text as an empty string. The result is assigned back rather
# than calling fillna(..., inplace=True) on df['text']: the in-place form on a
# column selection is chained assignment, which is deprecated and does not
# update df under pandas copy-on-write. Filling with '' instead of a 'NULL'
# sentinel also means a genuine text equal to "NULL" is never mistaken for a
# missing value, and no follow-up replace step is needed.
df['text'] = df['text'].fillna('')

# Join title and text into a single 'concat' column. A row without text
# becomes its title followed by the separator space.
df['concat'] = df['title'] + ' ' + df['text']

# Drop the id, title and text columns; only 'concat' is used from here on.
df = df.drop(columns=['id', 'title', 'text'])

# Lowercase everything.
df['concat'] = df['concat'].str.lower()

# Replace every digit with 0 (raw string so that \d reaches the regex engine
# instead of being parsed as a string escape).
df['concat'] = df['concat'].str.replace(r'\d', '0', regex=True)

# Remove every character that is neither a word character (\w: letters,
# digits, underscore) nor whitespace (\s), i.e. punctuation and symbols.
df['concat'] = df['concat'].str.replace(r'[^\w\s]', '', regex=True)

# Stop-word removal.
# Build the English stop-word collection once as a set: it is consulted for
# every token of every row, and set membership is O(1) whereas scanning the
# list returned by NLTK is O(n) per lookup.
stop_words = set(stopwords.words('english'))
def remove_stopwords(words, exclude=None):
    """Return *words* with stop words removed.

    The text is split on whitespace, tokens found in the stop-word
    collection are dropped, and the remaining tokens are re-joined with
    single spaces. Matching is exact and case-sensitive, so the text should
    already be lowercased when a lowercase stop-word list is used.

    Args:
        words: The text to filter, as a single string.
        exclude: Optional container of tokens to drop (anything supporting
            ``in``). Defaults to the module-level ``stop_words``.

    Returns:
        The filtered text; an empty string if no token survives.
    """
    if exclude is None:
        exclude = stop_words
    return ' '.join(word for word in words.split() if word not in exclude)

# Store the stop-word-free text in a new 'concat_nostop' column. The function
# is passed directly; wrapping it in a lambda adds nothing.
df['concat_nostop'] = df['concat'].apply(remove_stopwords)