Delete 爬虫.py
爬虫.py (34 lines deleted)
@@ -1,34 +0,0 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Target URL and request headers
base_url = "https://news.ycombinator.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
}

# Storage for scraped data
titles = []
links = []

# Scrape the content
for page in range(1, 4):  # fetch the first three pages
    url = f"{base_url}?p={page}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")  # requires the lxml package

    for item in soup.find_all("a", class_="titlelink"):
        titles.append(item.text)
        links.append(item["href"])

    print(f"Finished scraping page {page}")
    time.sleep(random.uniform(1, 3))  # random delay between requests

# Save the data to CSV
data = {"Title": titles, "Link": links}
df = pd.DataFrame(data)
df.to_csv("news.csv", index=False, encoding="utf-8-sig")
print("Data saved to news.csv")
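A note on the selector: the deleted script matched anchors with the `titlelink` class, which Hacker News used at the time. The site has since moved story titles into a `titleline` span wrapper, so a revived version of this scraper would need an adjusted inner loop. A minimal sketch, assuming the newer markup:

# Sketch only: assumes Hacker News' newer markup, where each story title
# sits inside <span class="titleline"><a href="...">...</a></span>.
for span in soup.find_all("span", class_="titleline"):
    link = span.find("a")  # the first anchor in the span is the story link
    if link is not None:
        titles.append(link.get_text())
        links.append(link["href"])

Only the selector changes; the pagination, delay, and CSV-export logic above would carry over unchanged.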