📺

爬取B站视频

"""
@description: Crawl Bilibili video and audio and merge them (merging seems
              unnecessary for movies); currently supports movies and short videos.
@path: video/audio download dir: process_video; merged output dir: finally_video
@class: VideoSpider
"""
from wyw.bilibili音视频爬取.videoSpider import VideoSpider
from selenium import webdriver


def get_cookie(url):
    """Return the login cookies for *url* as a single "name=value; ..." string.

    Opens the page in a headless Chrome session that reuses the local user
    profile, so the logged-in session's cookies (needed for high-bitrate
    streams) are available.

    :param url: page to visit so the site's cookies are populated.
    :return: cookie string suitable for a "Cookie" request header.
    """
    user_data_dir = r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data"
    user_option = webdriver.ChromeOptions()
    user_option.add_argument("--headless")
    user_option.add_argument(f'--user-data-dir={user_data_dir}')
    # Initialize the browser driver
    driver = webdriver.Chrome(options=user_option)
    try:
        # Open the page so the session's cookies are loaded
        driver.get(url=url)
        cookies = driver.get_cookies()
        # Serialize into "Cookie" request-header format
        cookie_str = "; ".join(f"{cookie['name']}={cookie['value']}" for cookie in cookies)
    finally:
        # BUGFIX: always quit the browser, even when page load raises,
        # otherwise a headless Chrome process is leaked on every failure.
        driver.quit()
    return cookie_str


if __name__ == '__main__':
    # Test data: https://www.bilibili.com/bangumi/play/ss3901,
    #            https://www.bilibili.com/bangumi/play/ep1495640
    url = 'https://www.bilibili.com/bangumi/play/ss3901'
    cookie = get_cookie(url)
    headers = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/124.0.0.0 Safari/537.36",
        "Cookie": cookie
    }
    video_spider = VideoSpider(url=url, headers=headers)
    # Dispatch to the right parser based on the page type
    video_spider.check_video_type()
    # video_spider.video_compose()  # mux audio + video
bilibiliSpider.py
"""
@description: companion module to bilibiliSpider.py
"""
import requests
import re
import os
import json
import subprocess
from selenium import webdriver
import time


class VideoSpider:
    """Download Bilibili short videos and movies, and mux A/V with ffmpeg.

    Short videos ship video and audio as separate DASH streams; movies are a
    single muxed stream. ``check_video_type`` inspects the page and dispatches
    to the right parser.
    """

    def __init__(self, url, headers):
        """Fetch the page HTML once; *headers* must carry a logged-in Cookie."""
        # Set by the parsers; only needed later when composing the final video.
        # BUGFIX: was `self.title = one` — `one` is an undefined name and
        # raised NameError on every construction.
        self.title = None
        self.url = url
        self.headers = headers
        self.html = requests.get(url, headers=self.headers).text

    def video_spider(self):
        """Parse a short-video page and download its video + audio streams."""
        try:
            # Extract the video title (fall back to a timestamped placeholder)
            title = re.findall('title="(.*?)"', self.html)[0]
            if not title:
                now_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
                title = f"暂无标题-{now_time}"
            self.title = title
            print(f"视频标题: {title}")
            # Playback info is a JSON blob embedded in a <script> tag
            info = re.findall('window.__playinfo__=(.*?)</script>', self.html)[0]
            json_data = json.loads(info)
            # First (highest-priority) DASH video and audio stream URLs
            video_url = json_data['data']['dash']['video'][0]['baseUrl']
            audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
            print(f"视频链接: {video_url}")
            print(f"音频链接: {audio_url}")
            # Download both streams
            video_content = requests.get(url=video_url, headers=self.headers).content
            audio_content = requests.get(url=audio_url, headers=self.headers).content
            # Create the per-title output folder
            if not os.path.exists(f'process_video\\{title}'):
                os.makedirs(f'process_video\\{title}')
            # Save the raw video and audio data
            with open(f'process_video\\{title}\\{title}.mp4', mode='wb') as v:
                v.write(video_content)
            with open(f'process_video\\{title}\\{title}.mp3', mode='wb') as a:
                a.write(audio_content)
            print("视频音频下载完成!")
        except requests.exceptions.RequestException as e:
            print(f"请求错误: {e}")
        except json.JSONDecodeError:
            print("解析JSON时出错")
        except Exception as e:
            print(f"发生错误: {e}")

    def movie_spider(self):
        """Parse a movie/bangumi page and download its single muxed stream."""
        try:
            # Title from the og:title meta tag (timestamped fallback if empty)
            title = re.findall('property="og:title" content="(.*?)"*/?>', self.html)[0]
            if not title:
                now_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
                title = f"暂无标题-{now_time}"
            self.title = title
            print("视频标题为:", title)
            # Stream info is a JSON fragment between "video_info" and "view_info"
            video_info = re.findall('"video_info":(.*?),"view_info"', self.html)
            json_data = json.loads(video_info[0])
            try:
                # VIP layout: direct url list under "durl"
                movie_url = json_data["durl"][0]['url']
            except (KeyError, IndexError):
                # BUGFIX: narrowed from bare Exception — only a missing/empty
                # "durl" should trigger the non-VIP DASH fallback.
                movie_url = json_data["dash"]["video"][0]["base_url"]
            print("视频链接:", movie_url)
            print("视频下载中~~~")
            movie_content = requests.get(url=movie_url, headers=self.headers).content
            # Create the per-title output folder
            if not os.path.exists(f'process_video\\{title}'):
                os.makedirs(f'process_video\\{title}')
            with open(f'process_video\\{title}\\{title}.mp4', mode='wb') as m:
                m.write(movie_content)
            print("✅ 视频下载完成!")
        except requests.exceptions.RequestException as e:
            print(f"请求错误: {e}")
        except json.JSONDecodeError:
            print("解析JSON时出错")
        except Exception as e:
            print(f"发生错误: {e}")

    def check_video_type(self):
        """Inspect the og:type meta tag and dispatch to the proper parser."""
        matches = re.findall(r'property="og:type" content="(video[^"]*)"*/?>', self.html)
        # BUGFIX: re.findall returns a LIST, so the original comparison
        # `matches == "video"` was always False and every page — including
        # short videos — was routed to movie_spider. Compare the first match.
        if matches and matches[0] == "video":
            print("短视频")
            self.video_spider()
        else:
            print("电影")
            self.movie_spider()

    def video_compose(self):
        """Mux the downloaded video and audio into one file via ffmpeg."""
        if self.title:
            print("开始合并视频...")
            if not os.path.exists(f'finally_video\\{self.title}'):
                os.makedirs(f'finally_video\\{self.title}')
            # Copy the video stream, re-encode audio to AAC
            cmd = (
                f'ffmpeg -hide_banner -i "process_video\\{self.title}\\{self.title}.mp4" '
                f'-i "process_video\\{self.title}\\{self.title}.mp3" '
                f'-c:v copy -c:a aac -strict experimental '
                f'"finally_video\\{self.title}\\{self.title}.mp4"'
            )
            subprocess.run(cmd)
        else:
            # BUGFIX: the original had a duplicated, garbled else branch that
            # called subprocess.run(cmd) with `cmd` unbound (UnboundLocalError).
            print("title is None")
videoSpider.py

总结

  • 登录后才能爬取到高码率的视频,因此get请求需要调用cookie
  • 需要获取登录状态的cookie才有效
  • 爬取下来的视频可能不支持预览,可能原因是播放器解码失败,选用KMPlayer播放视频
  • ffmpeg的视频合成效率远高于moviepy
  • 现如今只适配了电影和短视频,只保证了测试数据能正常运行,关键的问题还是在 check_video_type 这一个函数上面