BeautifulSoup4の基本 • barorin&?

はじめに

BeautifulSoup4の基本的な使い方をまとめてみます。

内容

基本のキ

import requests
from bs4 import BeautifulSoup

# Fetch the HTML. Without a timeout, requests can block forever on an
# unresponsive server.
res = requests.get('https://www.hoge.com/', timeout=10)

# Save a local copy. The encoding is given explicitly so the result does not
# depend on the platform's locale default (e.g. cp932 on Japanese Windows).
with open('res.html', 'w', encoding='utf-8') as file:
    file.write(res.text)

# Parse the HTML straight from the response
soup = BeautifulSoup(res.text, 'html.parser')

# Alternatively, parse from the locally saved HTML file.
# The with-block closes the file handle once parsing is done.
with open('res.html', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

selenium と組み合わせて使う

from urllib.parse import urljoin
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Driver options: run headless Chrome with flags commonly needed in containers
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Selenium 4 no longer accepts the driver path as a positional argument;
# omitting it lets Selenium locate chromedriver itself.
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(10)

# Load the page and grab the rendered HTML
base_url = 'https://www.hoge.com/'
try:
    driver.get(base_url)
    # page_source is already a str. Passing it as-is avoids BeautifulSoup
    # re-detecting the encoding from bytes, which can go wrong when the page's
    # meta charset declares something other than UTF-8.
    html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page fails
    driver.quit()
soup = BeautifulSoup(html, 'html.parser')

# Extract the text of specific elements
titles = [title.get_text(strip=True) for title in soup.find_all('p', class_='クラス名')]

# Collect link targets.
# BeautifulSoup applies regex filters with search(), so the pattern is anchored
# with ^ to keep only hrefs that start with /home.
attrs = {'href': re.compile(r'^/home')}
links = [urljoin(base_url, url.get('href')) for url in soup.find_all('a', attrs=attrs)]