I Wrote a Python Script to Scrape Image and Video Links from a URL

梦泽 2025-01-19

First, import some libraries

Some readers complained that my earlier articles were too stiff, so from now on I'm writing in this more casual style. Since it's my first time switching styles, if anything feels off, let me know in the comments.

First things first: install and import the libraries we need. Nothing fancy here; they're all standard tools for crawler development.
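If you don't have them yet, everything is on PyPI (beautifulsoup4 is the package behind BeautifulSoup, fake-useragent the one behind fake_useragent):

pip install requests beautifulsoup4 pyfiglet fake-useragent colorama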

import requests
from bs4 import BeautifulSoup
import pyfiglet
import random
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from colorama import init, Fore
  • requests: sends the HTTP requests.
  • BeautifulSoup: parses the page HTML to find image and video links.
  • pyfiglet: renders an ASCII-art banner so the output looks nicer.
  • fake_useragent: generates a random browser User-Agent to make blocking less likely.
  • random: picks random values (used below for the spoofed IP).
  • HTTPAdapter / Retry: add automatic retries to the requests session.
  • colorama: adds some color to the terminal output.

Give the crawler some flair

Right at the start, give the script a bit of personality: use pyfiglet to print a big welcome banner. At the very least it catches the eye.

init(autoreset=True)  # colorama: reset colors automatically after every print

art = pyfiglet.figlet_format("ZeTooL-Img")
print(art)

The text you pass to figlet_format determines what banner gets printed.

Get the URL from the user

Next, ask the user for the link they want to scrape. That link is the crawler's target page.

url = input("请输入要爬取的链接: ")
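The script takes the input as-is and does no validation. If you want a minimal guard, something like this works (a hypothetical addition, not part of the original script):

# Hypothetical guard: prepend a scheme if the user left it out
if not url.startswith(("http://", "https://")):
    url = "https://" + url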

Disguise the request as a browser

To make blocking less likely, the crawler disguises itself. fake_useragent generates a random browser User-Agent, and we also attach a random IP via an X-Forwarded-For header. Whether the target honors that header is up to the server, but posing as different browsers from different regions improves the odds of not getting blocked.

systems = [
    "Windows NT 6.1; Win64; x64",  # Windows 7
    "Windows NT 10.0; Win64; x64",  # Windows 10
    "Android 8.0.0; Pixel 2",      # Android 8
    "iOS 16.0; iPhone 13"          # iOS 16
]
system = random.choice(systems)  # note: not actually used below; fake_useragent builds the full UA string itself

# Random IP in a Chinese mobile-carrier range (223.104.x.x) for the X-Forwarded-For header
ip = f"223.104.{random.randint(0, 255)}.{random.randint(0, 255)}"
ua = UserAgent()
user_agent = ua.random  # random User-Agent

# Request headers
headers = {
    "User-Agent": user_agent,
    "X-Forwarded-For": ip,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
}

With this disguise in place, the crawler is harder to spot.
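If you're curious what a server actually sees, one quick sanity check is to echo the headers back from a service such as httpbin.org (assuming it's reachable from your machine; note that proxies along the way may rewrite X-Forwarded-For):

# Echo back the request headers as received by the server
check = requests.get("https://httpbin.org/headers", headers=headers, timeout=10)
print(check.json()["headers"])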

Set up a request retry mechanism

A flaky network or a struggling server can make requests fail. To keep the script from falling over, add a retry mechanism: a failed request is automatically retried up to 3 times, with the wait roughly doubling between attempts, so a single hiccup doesn't crash the program.

# Retry strategy
retry_strategy = Retry(
    total=3,  # retry at most 3 times
    backoff_factor=1,  # exponential backoff: the wait roughly doubles between retries
    status_forcelist=[500, 502, 503, 504, 404, 403],  # these status codes trigger a retry
    allowed_methods=["HEAD", "GET", "OPTIONS"]  # only retry HEAD, GET and OPTIONS requests
)

session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

Define a function to fetch the page

Write a function that sends the request and fetches the page, printing a friendly message when something goes wrong.

def fetch_page(url):
    try:
        response = session.get(url, headers=headers, timeout=10)  # 10-second request timeout
        response.raise_for_status()  # raise an exception if the server returned an error status
        return response
    except requests.exceptions.Timeout:
        print(Fore.RED + "错误: 请求超时,请检查网络连接或稍后重试。")
        return None
    except requests.exceptions.TooManyRedirects:
        print(Fore.RED + "错误: 请求的URL出现重定向过多的情况。")
        return None
    except requests.exceptions.HTTPError as http_err:
        print(Fore.RED + f"HTTP 错误: {http_err}")
        return None
    except requests.exceptions.RequestException as e:
        print(Fore.RED + f"请求出错: {e}")
        return None

This way, the rest of the script only runs once the page has been fetched successfully.
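Calling it looks like this; the rest of the walkthrough assumes the request succeeded (the full script at the end keeps the parsing code inside an else branch instead of exiting, which amounts to the same thing):

response = fetch_page(url)
if response is None:
    print(Fore.RED + "无法爬取该链接,请检查URL或稍后再试。")
    raise SystemExit(1)  # bail out early if the page could not be fetched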

Parse the page and grab the image and video links

BeautifulSoup parses the HTML, and we grab the src attribute of every <img> and <video> tag (the full script at the end also checks nested <source> tags). If a src is a relative path, it gets joined onto the page URL to form a full link.

soup = BeautifulSoup(response.text, 'html.parser')

image_links = []  # collected image links
video_links = []  # collected video links

# Extract image links
for img_tag in soup.find_all('img'):
    img_src = img_tag.get('src')
    if img_src:
        if not img_src.startswith('http'):  # join relative paths onto the page URL
            img_src = url + img_src
        image_links.append(img_src)
        print(Fore.GREEN + f"[Fetch Log] Image Link: {img_src}")

# Extract video links
for video_tag in soup.find_all('video'):
    video_src = video_tag.get('src')
    if video_src:
        if not video_src.startswith('http'):  # join relative paths onto the page URL
            video_src = url + video_src
        video_links.append(video_src)
        print(Fore.GREEN + f"[Fetch Log] Video Link: {video_src}")
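One caveat: plain string concatenation mangles relative paths that start with / or ../ (you end up with things like page.html/img/a.png). If that bites, urllib.parse.urljoin from the standard library resolves them correctly; a minimal sketch:

from urllib.parse import urljoin

# Resolves "/img/a.png", "../a.png" and "//cdn.example.com/a.png" against the page URL
img_src = urljoin(url, img_tag.get('src'))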

Save the links to a file

Finally, write the collected image and video links to media_links.txt. The format is simple and easy to process later.

with open('media_links.txt', 'w', encoding='utf-8') as file:
    file.write('--- Image Links ---\n')
    for img_link in image_links:
        file.write(img_link + '\n')
    
    file.write('\n--- Video Links ---\n')
    for video_link in video_links:
        file.write(video_link + '\n')

print(Fore.GREEN + '链接已经保存到 media_links.txt 文件中。')
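For reference, the resulting media_links.txt ends up looking roughly like this (the URLs here are just illustrative):

--- Image Links ---
https://example.com/static/banner.png
https://example.com/images/photo01.jpg

--- Video Links ---
https://example.com/media/intro.mp4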

The full script

import requests
from bs4 import BeautifulSoup
import pyfiglet
import random
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from colorama import init, Fore

# Buddha bless, may there never be bugs
#      ____  
#     /    \ 
#    /      \
#   |        |
#   |  O  O  |  Buddha bless
#   |    ^   |
#   |   ---  |  
#    \______/
#   /        \
#  /__________\
#    |      |
#    |      |
#    |      |
#   (__)  (__)

init(autoreset=True)

art = pyfiglet.figlet_format("ZeTooL-Img")
print(art)

url = input("请输入要爬取的链接: ")

systems = [
    "Windows NT 6.1; Win64; x64",
    "Windows NT 10.0; Win64; x64",
    "Windows NT 6.3; Win64; x64",
    "Android 8.0.0; Pixel 2",
    "Android 9.0.0; Pixel 3",
    "Android 10.0; Pixel 4",
    "iOS 16.0; iPhone 13",
    "iOS 17.0; iPhone 14",
    "iOS 18.0; iPhone 15"
]
system = random.choice(systems)  # note: not actually used below; fake_useragent builds the UA string

ip = f"223.104.{random.randint(0, 255)}.{random.randint(0, 255)}"
ua = UserAgent()
user_agent = ua.random

headers = {
    "User-Agent": user_agent,
    "X-Forwarded-For": ip,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
}

retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504, 404, 403],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)

session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

def fetch_page(url):
    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response
    except requests.exceptions.Timeout:
        print(Fore.RED + "错误: 请求超时,请检查网络连接或稍后重试。")
        return None
    except requests.exceptions.TooManyRedirects:
        print(Fore.RED + "错误: 请求的URL出现重定向过多的情况。")
        return None
    except requests.exceptions.HTTPError as http_err:
        if response.status_code == 404:
            print(Fore.RED + "错误: 找不到页面 (404)。请检查URL是否正确。")
        elif response.status_code == 403:
            print(Fore.RED + "错误: 权限不足 (403)。访问被拒绝。")
        else:
            print(Fore.RED + f"HTTP 错误: {http_err}")
        return None
    except requests.exceptions.RequestException as e:
        print(Fore.RED + f"请求出错: {e}")
        return None

response = fetch_page(url)
if response is None:
    print(Fore.RED + "无法爬取该链接,请检查URL或稍后再试。")
else:
    soup = BeautifulSoup(response.text, 'html.parser')

    image_links = []
    video_links = []

    for img_tag in soup.find_all('img'):
        img_src = img_tag.get('src')
        if img_src:
            if not img_src.startswith('http'):
                img_src = url + img_src
            image_links.append(img_src)
            print(Fore.GREEN + f"[Fetch Log] Image Link: {img_src}")

    for video_tag in soup.find_all('video'):
        video_src = video_tag.get('src')
        if video_src:
            if not video_src.startswith('http'):
                video_src = url + video_src
            video_links.append(video_src)
            print(Fore.GREEN + f"[Fetch Log] Video Link: {video_src}")
        
        for source_tag in video_tag.find_all('source'):
            source_src = source_tag.get('src')
            if source_src:
                if not source_src.startswith('http'):
                    source_src = url + source_src
                video_links.append(source_src)
                print(Fore.GREEN + f"[Fetch Log] Video Link: {source_src}")

    with open('media_links.txt', 'w', encoding='utf-8') as file:
        file.write('--- Image Links ---\n')
        for img_link in image_links:
            file.write(img_link + '\n')
        
        file.write('\n--- Video Links ---\n')
        for video_link in video_links:
            file.write(video_link + '\n')

    print(Fore.GREEN + '链接已经保存到 media_links.txt 文件中。')
Tags: Python, crawler, web scraping

