File size: 1,002 Bytes
2854fa8
5b573f3
 
2854fa8
5b573f3
2854fa8
 
5b573f3
2854fa8
 
5b573f3
2854fa8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Import the required libraries.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the web page to scrape.
# The value below is only a placeholder: replace it with the real page URL
# before running the script.
url = '์—ฌ๊ธฐ์—_์ถ”์ถœํ•˜๊ณ ์ž_ํ•˜๋Š”_์›นํŽ˜์ด์ง€์˜_URL์„_์ž…๋ ฅํ•˜์„ธ์š”'

# Upper bound (in seconds) on how long to wait for the server. requests
# applies no timeout by default, so without this an unresponsive host would
# make the script hang forever.
REQUEST_TIMEOUT_SECONDS = 10

# Fetch the page content with requests.
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Stop on HTTP 4xx/5xx instead of silently parsing (and exporting) an error
# page as if it were the real content.
response.raise_for_status()

# If the server did not declare a charset, requests falls back to ISO-8859-1
# for text responses, which garbles non-Latin pages. In that case use the
# encoding detected from the body instead.
if 'charset' not in response.headers.get('Content-Type', '').lower():
    response.encoding = response.apparent_encoding

# Build a BeautifulSoup object to parse the HTML.
soup = BeautifulSoup(response.text, 'html.parser')

# Select the part of the page to extract data from.
# Example: collect the text of every 'p' tag on the page.
# Adjust the selector to match the actual use case.
texts = [p.text for p in soup.find_all('p')]

# Print the extracted data, one paragraph per line.
for text in texts:
    print(text)

# Optional: convert the extracted data to a DataFrame and save it as an
# Excel file (pandas needs an Excel writer engine such as openpyxl for .xlsx).
df = pd.DataFrame(texts, columns=['Text'])
df.to_excel('extracted_data.xlsx', index=False)