Elec_1 / app.py
ckfrpark's picture
Update app.py
2854fa8 verified
raw
history blame contribute delete
No virus
1 kB
# Import the required libraries.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# ์›น ํŽ˜์ด์ง€์˜ URL์„ ์ง€์ •ํ•ฉ๋‹ˆ๋‹ค.
url = '์—ฌ๊ธฐ์—_์ถ”์ถœํ•˜๊ณ ์ž_ํ•˜๋Š”_์›นํŽ˜์ด์ง€์˜_URL์„_์ž…๋ ฅํ•˜์„ธ์š”'
# requests๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์›น ํŽ˜์ด์ง€์˜ ๋‚ด์šฉ์„ ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค.
response = requests.get(url)
# BeautifulSoup ๊ฐ์ฒด๋ฅผ ์ƒ์„ฑํ•˜์—ฌ HTML์„ ํŒŒ์‹ฑํ•ฉ๋‹ˆ๋‹ค.
soup = BeautifulSoup(response.text, 'html.parser')
# ์›น ํŽ˜์ด์ง€์˜ ํŠน์ • ๋ถ€๋ถ„์„ ์„ ํƒํ•˜์—ฌ ๋ฐ์ดํ„ฐ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
# ์˜ˆ์‹œ: ํŽ˜์ด์ง€์˜ ๋ชจ๋“  'p' ํƒœ๊ทธ์— ์žˆ๋Š” ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
# ์‹ค์ œ ์‚ฌ์šฉ ์‚ฌ๋ก€์— ๋งž๊ฒŒ ์„ ํƒ์ž๋ฅผ ์กฐ์ •ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
texts = [p.text for p in soup.find_all('p')]
# ์ถ”์ถœํ•œ ๋ฐ์ดํ„ฐ๋ฅผ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.
for text in texts:
print(text)
# ์„ ํƒ์ : ์ถ”์ถœํ•œ ๋ฐ์ดํ„ฐ๋ฅผ DataFrame์œผ๋กœ ๋ณ€ํ™˜ํ•˜๊ณ  ์—‘์…€ ํŒŒ์ผ๋กœ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.
df = pd.DataFrame(texts, columns=['Text'])
df.to_excel('extracted_data.xlsx', index=False)