Delete 爬虫.py
爬虫.py | 34 lines removed

@@ -1,34 +0,0 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Target URL and request headers
base_url = "https://news.ycombinator.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
}

# Containers for the scraped data
titles = []
links = []

# Scrape the listings
for page in range(1, 4):  # first three pages
    url = f"{base_url}?p={page}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    # Each story title on Hacker News is an <a> inside <span class="titleline">
    for item in soup.select("span.titleline > a"):
        titles.append(item.text)
        links.append(item["href"])

    print(f"Finished scraping page {page}")
    time.sleep(random.uniform(1, 3))  # random delay between requests

# Save the data to CSV
data = {"Title": titles, "Link": links}
df = pd.DataFrame(data)
df.to_csv("news.csv", index=False, encoding="utf-8-sig")
print("Data saved to news.csv")
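
As a usage note: the deleted script calls requests.get with no timeout, so a hung connection would stall the crawl indefinitely. A minimal hardened fetch is sketched below, assuming the same headers dict as in the script; the fetch_page name, retry count, and backoff are illustrative, not part of the original file.

import time

import requests


def fetch_page(url: str, headers: dict, retries: int = 3, timeout: float = 10.0) -> str:
    """Fetch a URL with a timeout and a simple retry loop, returning the body."""
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
            return response.text
        except requests.RequestException:
            if attempt == retries:
                raise
            time.sleep(2 ** attempt)  # back off before the next attempt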