From 23fe0488cd151713812218848b474e8560ee3fb2 Mon Sep 17 00:00:00 2001
From: xt66642 <136733984@qq.com>
Date: Mon, 20 Jan 2025 17:08:21 +0800
Subject: [PATCH] Upload file to /
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pachong.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 pachong.py

diff --git a/pachong.py b/pachong.py
new file mode 100644
index 0000000..4cc365b
--- /dev/null
+++ b/pachong.py
@@ -0,0 +1,51 @@
+import requests
+from bs4 import BeautifulSoup
+import time
+import random
+
+def pachong(url):
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
+    }
+
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()  # Raise an exception if the request failed
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Extract all links on the page
+        links = []
+        for a_tag in soup.find_all("a", href=True):
+            link = a_tag["href"]
+            links.append(link)
+
+        print(f"Crawled {len(links)} links from {url}.")
+        return links
+
+    except requests.RequestException as e:
+        print(f"Failed to access {url}: {e}")
+        return []
+
+def main():
+    # Read multiple site URLs from user input
+    urls = input("Enter one or more site URLs, separated by commas: ").strip().split(",")
+
+    for url in urls:
+        url = url.strip()  # Strip surrounding whitespace
+        if url:  # Skip empty entries
+            print(f"Crawling {url}...")
+            links = pachong(url)
+
+            # Write each site's links to its own file
+            filename = f"{url.replace('https://', '').replace('http://', '').replace('www.', '').replace('/', '_')}.txt"
+            with open(filename, "w", encoding="utf-8") as file:
+                for link in links:
+                    file.write(link + "\n")
+
+            print(f"Links saved to {filename}")
+            time.sleep(random.uniform(1, 3))  # Random delay to avoid hammering the server
+
+    print("Links for every site have been saved to separate files.")
+
+if __name__ == "__main__":
+    main()
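
Note (a sketch, not part of the patch above): a_tag["href"] frequently yields relative paths such as "/about", so the saved files can mix relative and absolute URLs. One minimal way to normalize them is urllib.parse.urljoin from the standard library; the helper name extract_links below is hypothetical, not from the patch.

    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    def extract_links(url: str) -> list[str]:
        # Hypothetical helper, not in the patch: fetch the page and resolve
        # every href against the page URL, so relative paths like "/about"
        # become absolute URLs.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]

With this, extract_links("https://example.com") would return fully qualified URLs no matter how each href is written in the page.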
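
A second hedged sketch: the chained str.replace calls that build filename still let through characters that are invalid on some filesystems (":" from a port, "?" and "&" from a query string). One possible alternative based on urllib.parse.urlparse, assuming Python 3.9+ for str.removeprefix; the name safe_filename is an assumption, not in the patch.

    from urllib.parse import urlparse

    def safe_filename(url: str) -> str:
        # Hypothetical helper, not in the patch: keep only characters that
        # are safe on common filesystems; everything else (":", "?", "&", ...)
        # becomes an underscore.
        parsed = urlparse(url)
        raw = (parsed.netloc + parsed.path).removeprefix("www.")
        cleaned = "".join(c if c.isalnum() or c in ".-" else "_" for c in raw)
        return f"{cleaned or 'site'}.txt"

For example, safe_filename("https://example.com/a?q=1") returns "example.com_a.txt", which is writable on Windows as well, unlike a name containing "?".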