📺

爬取B站视频

"""
@description: Crawl Bilibili video and audio and merge them (merging seems
              unnecessary for movies); currently supports movies and short videos.
@path: video/audio download dir: process_video; merged output dir: finally_video
@class: VideoSpider
"""
from wyw.bilibili音视频爬取.videoSpider import VideoSpider
from selenium import webdriver


def get_cookie(url):
    """Return the login cookies for *url* as a single "name=value; ..." string.

    Opens the page in a headless Chrome session that reuses the local user
    profile, so the logged-in session's cookies (needed for high-bitrate
    streams) are available.

    :param url: page to visit so the site's cookies are populated.
    :return: cookie string suitable for a "Cookie" request header.
    """
    user_data_dir = r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data"
    user_option = webdriver.ChromeOptions()
    user_option.add_argument("--headless")
    user_option.add_argument(f'--user-data-dir={user_data_dir}')
    # Initialize the browser driver
    driver = webdriver.Chrome(options=user_option)
    try:
        # Open the page so the session's cookies are loaded
        driver.get(url=url)
        cookies = driver.get_cookies()
        # Serialize into "Cookie" request-header format
        cookie_str = "; ".join(f"{cookie['name']}={cookie['value']}" for cookie in cookies)
    finally:
        # BUGFIX: always quit the browser, even when page load raises,
        # otherwise a headless Chrome process is leaked on every failure.
        driver.quit()
    return cookie_str


if __name__ == '__main__':
    # Test data: https://www.bilibili.com/bangumi/play/ss3901,
    #            https://www.bilibili.com/bangumi/play/ep1495640
    url = 'https://www.bilibili.com/bangumi/play/ss3901'
    cookie = get_cookie(url)
    headers = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/124.0.0.0 Safari/537.36",
        "Cookie": cookie
    }
    video_spider = VideoSpider(url=url, headers=headers)
    # Dispatch to the right parser based on the page type
    video_spider.check_video_type()
    # video_spider.video_compose()  # mux audio + video
bilibiliSpider.py
"""
@description: companion module to bilibiliSpider.py
"""
import requests
import re
import os
import json
import subprocess
from selenium import webdriver
import time


class VideoSpider:
    """Download Bilibili short videos and movies, and mux A/V with ffmpeg.

    Short videos ship video and audio as separate DASH streams; movies are a
    single muxed stream. ``check_video_type`` inspects the page and dispatches
    to the right parser.
    """

    def __init__(self, url, headers):
        """Fetch the page HTML once; *headers* must carry a logged-in Cookie."""
        # Set by the parsers; only needed later when composing the final video.
        # BUGFIX: was `self.title = one` — `one` is an undefined name and
        # raised NameError on every construction.
        self.title = None
        self.url = url
        self.headers = headers
        self.html = requests.get(url, headers=self.headers).text

    def video_spider(self):
        """Parse a short-video page and download its video + audio streams."""
        try:
            # Extract the video title (fall back to a timestamped placeholder)
            title = re.findall('title="(.*?)"', self.html)[0]
            if not title:
                now_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
                title = f"暂无标题-{now_time}"
            self.title = title
            print(f"视频标题: {title}")
            # Playback info is a JSON blob embedded in a <script> tag
            info = re.findall('window.__playinfo__=(.*?)</script>', self.html)[0]
            json_data = json.loads(info)
            # First (highest-priority) DASH video and audio stream URLs
            video_url = json_data['data']['dash']['video'][0]['baseUrl']
            audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
            print(f"视频链接: {video_url}")
            print(f"音频链接: {audio_url}")
            # Download both streams
            video_content = requests.get(url=video_url, headers=self.headers).content
            audio_content = requests.get(url=audio_url, headers=self.headers).content
            # Create the per-title output folder
            if not os.path.exists(f'process_video\\{title}'):
                os.makedirs(f'process_video\\{title}')
            # Save the raw video and audio data
            with open(f'process_video\\{title}\\{title}.mp4', mode='wb') as v:
                v.write(video_content)
            with open(f'process_video\\{title}\\{title}.mp3', mode='wb') as a:
                a.write(audio_content)
            print("视频音频下载完成!")
        except requests.exceptions.RequestException as e:
            print(f"请求错误: {e}")
        except json.JSONDecodeError:
            print("解析JSON时出错")
        except Exception as e:
            print(f"发生错误: {e}")

    def movie_spider(self):
        """Parse a movie/bangumi page and download its single muxed stream."""
        try:
            # Title from the og:title meta tag (timestamped fallback if empty)
            title = re.findall('property="og:title" content="(.*?)"*/?>', self.html)[0]
            if not title:
                now_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
                title = f"暂无标题-{now_time}"
            self.title = title
            print("视频标题为:", title)
            # Stream info is a JSON fragment between "video_info" and "view_info"
            video_info = re.findall('"video_info":(.*?),"view_info"', self.html)
            json_data = json.loads(video_info[0])
            try:
                # VIP layout: direct url list under "durl"
                movie_url = json_data["durl"][0]['url']
            except (KeyError, IndexError):
                # BUGFIX: narrowed from bare Exception — only a missing/empty
                # "durl" should trigger the non-VIP DASH fallback.
                movie_url = json_data["dash"]["video"][0]["base_url"]
            print("视频链接:", movie_url)
            print("视频下载中~~~")
            movie_content = requests.get(url=movie_url, headers=self.headers).content
            # Create the per-title output folder
            if not os.path.exists(f'process_video\\{title}'):
                os.makedirs(f'process_video\\{title}')
            with open(f'process_video\\{title}\\{title}.mp4', mode='wb') as m:
                m.write(movie_content)
            print("✅ 视频下载完成!")
        except requests.exceptions.RequestException as e:
            print(f"请求错误: {e}")
        except json.JSONDecodeError:
            print("解析JSON时出错")
        except Exception as e:
            print(f"发生错误: {e}")

    def check_video_type(self):
        """Inspect the og:type meta tag and dispatch to the proper parser."""
        matches = re.findall(r'property="og:type" content="(video[^"]*)"*/?>', self.html)
        # BUGFIX: re.findall returns a LIST, so the original comparison
        # `matches == "video"` was always False and every page — including
        # short videos — was routed to movie_spider. Compare the first match.
        if matches and matches[0] == "video":
            print("短视频")
            self.video_spider()
        else:
            print("电影")
            self.movie_spider()

    def video_compose(self):
        """Mux the downloaded video and audio into one file via ffmpeg."""
        if self.title:
            print("开始合并视频...")
            if not os.path.exists(f'finally_video\\{self.title}'):
                os.makedirs(f'finally_video\\{self.title}')
            # Copy the video stream, re-encode audio to AAC
            cmd = (
                f'ffmpeg -hide_banner -i "process_video\\{self.title}\\{self.title}.mp4" '
                f'-i "process_video\\{self.title}\\{self.title}.mp3" '
                f'-c:v copy -c:a aac -strict experimental '
                f'"finally_video\\{self.title}\\{self.title}.mp4"'
            )
            subprocess.run(cmd)
        else:
            # BUGFIX: the original had a duplicated, garbled else branch that
            # called subprocess.run(cmd) with `cmd` unbound (UnboundLocalError).
            print("title is None")
videoSpider.py

总结

  • 登录后才能爬取到高码率的视频,因此get请求需要调用cookie
  • 需要获取登录状态的cookie才有效
  • 爬取下来的视频可能不支持预览,可能原因是播放器解码失败,选用KMPlayer播放视频
  • ffmpeg的视频合成效率远高于moviepy
  • 现如今只适配了电影和短视频,只保证了测试数据能正常运行,关键的问题还是在 check_video_type 这一个函数上面