A while back a few readers complained that my articles read too stiff, so from this one on I'm trying a looser style. It's my first time switching things up, so if anything feels off, say so in the comments.
Importing the libraries
First, install and import the libraries we need. Nothing fancy here; they're all staples of crawler work.
import requests
from bs4 import BeautifulSoup
import pyfiglet
import random
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from colorama import init, Fore
from urllib.parse import urljoin  # used further down to join relative src paths onto the page URL
- requests: sends the HTTP requests.
- BeautifulSoup: parses the page HTML so we can dig out the image and video links.
- pyfiglet: renders ASCII-art text so the tool prints a nice banner.
- random: picks the random OS string and the random IP octets for the spoofed headers.
- fake_useragent: generates a random browser User-Agent so the crawler is less likely to get blocked.
- HTTPAdapter / Retry: let the requests session retry failed requests automatically.
- urljoin: turns relative src paths into absolute URLs.
- colorama: adds some color to the terminal output so it looks a bit cooler.
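If any of these are missing, one pip install covers the lot (these are the PyPI package names; beautifulsoup4 is the package that provides bs4, and fake-useragent is spelled with a hyphen):
pip install requests beautifulsoup4 pyfiglet fake-useragent colorama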
Giving the crawler some flair
Right at the top, give the script a bit of personality: use pyfiglet to print a big welcome banner. At the very least it catches the eye.
init(autoreset=True)
art = pyfiglet.figlet_format("ZeTooL-Img")
print(art)
Whatever string you hand to figlet_format is what gets rendered, so swap in any name you like.
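pyfiglet also takes a font argument if you want a different look; a quick sketch (the slant font ships with pyfiglet, and any installed figlet font name works the same way):
# Same banner rendered in a different figlet font
art = pyfiglet.figlet_format("ZeTooL-Img", font="slant")
print(art)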
Asking for the target URL
Next, ask the user for the link they want to crawl. That page is the crawler's target for everything that follows.
url = input("请输入要爬取的链接: ")
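One small guard worth adding (my own addition, not part of the original script): requests refuses URLs without a scheme, so if someone types a bare domain, default it to https://.
url = input("请输入要爬取的链接: ").strip()
# requests needs an explicit scheme, so add one if the user left it off
if not url.startswith(("http://", "https://")):
    url = "https://" + url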
Pretending to be a real browser
To make the crawler harder to block, dress it up a little. fake_useragent hands us a random browser User-Agent, and we also drop a random IP into the X-Forwarded-For header. To be clear, that header only spoofs the claimed client address; sites that check the real connection will ignore it, but against naive checks it makes the requests look like different browsers from different places, which improves the odds of not getting banned.
systems = [
"Windows NT 6.1; Win64; x64", # Windows 7
"Windows NT 10.0; Win64; x64", # Windows 10
"Android 8.0.0; Pixel 2", # Android 8
"iOS 16.0; iPhone 13" # iOS 16
]
system = random.choice(systems)  # note: not actually used below; ua.random supplies the User-Agent, this is just a hook if you want to build your own UA string
# Random IP from a Chinese address range; it only goes into the X-Forwarded-For header
ip = f"223.104.{random.randint(0, 255)}.{random.randint(0, 255)}"
ua = UserAgent()
user_agent = ua.random  # pick a random real-world User-Agent string
# Request headers
headers = {
"User-Agent": user_agent,
"X-Forwarded-For": ip,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
}
With this disguise in place, the crawler is a little less likely to stand out.
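If you're curious what fake_useragent actually returns, printing a few samples makes the rotation obvious (just a curiosity check, not something the script needs; output differs on every run):
ua = UserAgent()
# each access to ua.random yields a different real-world User-Agent string
for _ in range(3):
    print(ua.random)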
Adding a retry mechanism
A flaky network or a struggling server can make a request fail. To keep the script from crashing on the first hiccup, add a retry policy: a failed request is retried up to 3 times, with a growing delay between attempts. (Retrying on 404 and 403 is debatable, since those rarely fix themselves, but it costs little beyond a short delay.)
# Retry policy for the session
retry_strategy = Retry(
    total=3,  # retry at most 3 times
    backoff_factor=1,  # wait longer before each new attempt
    status_forcelist=[500, 502, 503, 504, 404, 403],  # status codes that trigger a retry
    allowed_methods=["HEAD", "GET", "OPTIONS"]  # only retry these (idempotent) methods
)
session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
A function to fetch the page
Wrap the request in a function that fetches the page and prints a friendly message for each kind of failure instead of dumping a raw traceback.
def fetch_page(url):
try:
        response = session.get(url, headers=headers, timeout=10)  # give up after 10 seconds
        response.raise_for_status()  # raise for 4xx/5xx status codes
return response
except requests.exceptions.Timeout:
print(Fore.RED + "错误: 请求超时,请检查网络连接或稍后重试。")
return None
except requests.exceptions.TooManyRedirects:
print(Fore.RED + "错误: 请求的URL出现重定向过多的情况。")
return None
except requests.exceptions.HTTPError as http_err:
print(Fore.RED + f"HTTP 错误: {http_err}")
return None
except requests.exceptions.RequestException as e:
print(Fore.RED + f"请求出错: {e}")
return None
That way the rest of the script only runs once the page has actually been fetched.
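Calling it looks like this; it mirrors the guard at the end of the full listing, except that exiting early (my tweak) keeps the walkthrough snippets flat instead of nesting everything in an else: block.
response = fetch_page(url)
if response is None:
    print(Fore.RED + "无法爬取该链接,请检查URL或稍后再试。")
    raise SystemExit(1)  # nothing to parse, so stop here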
Parsing the page for image and video links
BeautifulSoup parses the HTML, and we pull the src attribute out of every <img> and <video> tag. Relative paths are joined onto the page URL with urljoin so we always end up with a full link.
soup = BeautifulSoup(response.text, 'html.parser')
image_links = []  # collected image links
video_links = []  # collected video links
# Pull links out of <img> tags
for img_tag in soup.find_all('img'):
    img_src = img_tag.get('src')
    if img_src:
        img_src = urljoin(url, img_src)  # resolves relative paths, leaves absolute URLs alone
        image_links.append(img_src)
        print(Fore.GREEN + f"[Fetch Log] Image Link: {img_src}")
# Pull links out of <video> tags
for video_tag in soup.find_all('video'):
    video_src = video_tag.get('src')
    if video_src:
        video_src = urljoin(url, video_src)  # resolves relative paths, leaves absolute URLs alone
        video_links.append(video_src)
        print(Fore.GREEN + f"[Fetch Log] Video Link: {video_src}")
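The full listing at the end goes one step further: plenty of <video> elements keep their URL on nested <source> children rather than on the tag itself, so it walks those too. The same idea as a standalone loop:
for video_tag in soup.find_all('video'):
    for source_tag in video_tag.find_all('source'):
        source_src = source_tag.get('src')
        if source_src:
            source_src = urljoin(url, source_src)
            video_links.append(source_src)
            print(Fore.GREEN + f"[Fetch Log] Video Link: {source_src}")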
Saving the links to a file
Finally, write the collected image and video links to media_links.txt. The format is deliberately simple, so the file is easy to process later.
with open('media_links.txt', 'w', encoding='utf-8') as file:
file.write('--- Image Links ---\n')
for img_link in image_links:
file.write(img_link + '\n')
file.write('\n--- Video Links ---\n')
for video_link in video_links:
file.write(video_link + '\n')
print(Fore.GREEN + '链接已经保存到 media_links.txt 文件中。')
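With a couple of links collected, the file ends up laid out like this (the URLs below are made-up placeholders, just to show the shape):
--- Image Links ---
https://example.com/images/banner.png
https://example.com/images/photo1.jpg

--- Video Links ---
https://example.com/media/intro.mp4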
The complete script
import requests
from bs4 import BeautifulSoup
import pyfiglet
import random
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from colorama import init, Fore
from urllib.parse import urljoin  # joins relative src paths onto the page URL
# May Buddha bless this code: no bugs, ever
# ____
# / \
# / \
# | |
# | O O |   Buddha bless
# | ^ |
# | --- |
# \______/
# / \
# /__________\
# | |
# | |
# | |
# (__) (__)
init(autoreset=True)
art = pyfiglet.figlet_format("ZeTooL-Img")
print(art)
url = input("请输入要爬取的链接: ")
systems = [
"Windows NT 6.1; Win64; x64",
"Windows NT 10.0; Win64; x64",
"Windows NT 6.3; Win64; x64",
"Android 8.0.0; Pixel 2",
"Android 9.0.0; Pixel 3",
"Android 10.0; Pixel 4",
"iOS 16.0; iPhone 13",
"iOS 17.0; iPhone 14",
"iOS 18.0; iPhone 15"
]
system = random.choice(systems)  # note: not used below; ua.random supplies the User-Agent
ip = f"223.104.{random.randint(0, 255)}.{random.randint(0, 255)}"
ua = UserAgent()
user_agent = ua.random
headers = {
"User-Agent": user_agent,
"X-Forwarded-For": ip,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
}
retry_strategy = Retry(
total=3,
backoff_factor=1,
status_forcelist=[500, 502, 503, 504, 404, 403],
allowed_methods=["HEAD", "GET", "OPTIONS"]
)
session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
def fetch_page(url):
try:
response = session.get(url, headers=headers, timeout=10)
response.raise_for_status()
return response
except requests.exceptions.Timeout:
print(Fore.RED + "错误: 请求超时,请检查网络连接或稍后重试。")
return None
except requests.exceptions.TooManyRedirects:
print(Fore.RED + "错误: 请求的URL出现重定向过多的情况。")
return None
except requests.exceptions.HTTPError as http_err:
if response.status_code == 404:
print(Fore.RED + "错误: 找不到页面 (404)。请检查URL是否正确。")
elif response.status_code == 403:
print(Fore.RED + "错误: 权限不足 (403)。访问被拒绝。")
else:
print(Fore.RED + f"HTTP 错误: {http_err}")
return None
except requests.exceptions.RequestException as e:
print(Fore.RED + f"请求出错: {e}")
return None
response = fetch_page(url)
if response is None:
print(Fore.RED + "无法爬取该链接,请检查URL或稍后再试。")
else:
soup = BeautifulSoup(response.text, 'html.parser')
image_links = []
video_links = []
for img_tag in soup.find_all('img'):
img_src = img_tag.get('src')
if img_src:
            img_src = urljoin(url, img_src)  # resolves relative paths, leaves absolute URLs alone
image_links.append(img_src)
print(Fore.GREEN + f"[Fetch Log] Image Link: {img_src}")
for video_tag in soup.find_all('video'):
video_src = video_tag.get('src')
if video_src:
            video_src = urljoin(url, video_src)  # resolves relative paths, leaves absolute URLs alone
video_links.append(video_src)
print(Fore.GREEN + f"[Fetch Log] Video Link: {video_src}")
for source_tag in video_tag.find_all('source'):
source_src = source_tag.get('src')
if source_src:
                source_src = urljoin(url, source_src)  # resolves relative paths, leaves absolute URLs alone
video_links.append(source_src)
print(Fore.GREEN + f"[Fetch Log] Video Link: {source_src}")
with open('media_links.txt', 'w', encoding='utf-8') as file:
file.write('--- Image Links ---\n')
for img_link in image_links:
file.write(img_link + '\n')
file.write('\n--- Video Links ---\n')
for video_link in video_links:
file.write(video_link + '\n')
print(Fore.GREEN + '链接已经保存到 media_links.txt 文件中。')
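To try it out, drop the whole listing into a file (the name below is just my example, pick whatever you like) and run it; you'll get the ASCII banner, the URL prompt, a green [Fetch Log] line for every link found, and a media_links.txt file next to the script.
python zetool_img.py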