"""
@description:爬取B站视频和音频,并合成(貌似不需要),现在可爬电影和短视频
@path:视频、音频路径:process_video;合成后路径:finally_video
@class:videoSpider
"""
from wyw.bilibili音视频爬取.videoSpider import VideoSpider
from selenium import webdriver
def get_cookie(url, user_data_dir=r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data"):
    """Open *url* in headless Chrome and return its cookies as a header string.

    The browser is started with an existing Chrome profile so that the
    cookies of an already logged-in session are available.

    Args:
        url: Page to load before reading the cookies.
        user_data_dir: Chrome "User Data" profile directory to reuse.
            Defaults to the Administrator profile the script was written for.

    Returns:
        The cookies joined as ``"name=value; name=value"``, ready to be used
        as the value of a ``Cookie`` request header.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument(f'--user-data-dir={user_data_dir}')

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url=url)
        cookies = driver.get_cookies()
    finally:
        # Always shut the browser down, even when loading the page fails,
        # otherwise a headless Chrome process is left running.
        driver.quit()

    return "; ".join(f"{cookie['name']}={cookie['value']}" for cookie in cookies)
if __name__ == '__main__':
    # Sample pages this script was tried against:
    #   https://www.bilibili.com/bangumi/play/ss3901
    #   https://www.bilibili.com/bangumi/play/ep1495640
    page_url = 'https://www.bilibili.com/bangumi/play/ss3901'

    # Read the cookies of the local Chrome profile first; they are sent with
    # every request the spider makes.
    session_cookie = get_cookie(page_url)
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    )
    request_headers = {"Referer": page_url, "User-Agent": user_agent}
    request_headers["Cookie"] = session_cookie

    spider = VideoSpider(url=page_url, headers=request_headers)
    # Detect the page type and run the matching downloader.
    spider.check_video_type()
    # Optional follow-up step: spider.video_compose() merges audio and video.
bilibiliSpider.py

"""
@description:接bilibiliSpider.py
"""
import requests
import re
import os
import json
import subprocess
from selenium import webdriver
import time
class VideoSpider:
    """Scrape and download the media streams of one Bilibili page.

    The page HTML is fetched once in ``__init__``. ``check_video_type`` then
    dispatches to ``video_spider`` (short videos) or ``movie_spider``
    (movies), which save their files under ``process_video/<title>/``.
    ``video_compose`` optionally merges the separate video and audio files
    into ``finally_video/<title>/`` by running ffmpeg.

    The two spider methods report parse and download failures with ``print``
    and return ``None`` instead of raising.
    """

    # Characters that are not allowed in Windows file names; page titles are
    # arbitrary text, so these are replaced before a title is used in a path.
    _UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, url, headers):
        """Store the request settings and download the page HTML.

        Args:
            url: Address of the Bilibili page to scrape.
            headers: Request headers sent with the page request and with
                every media download.
        """
        self.title = None  # filled in by the spiders, needed by video_compose()
        self.url = url
        self.headers = headers
        self.html = requests.get(url, headers=self.headers).text

    def _resolve_title(self, pattern):
        """Extract the title with *pattern*, store it on ``self.title`` and return it.

        When the pattern does not match, or matches an empty string, a
        timestamped placeholder title is used instead.
        """
        found = re.findall(pattern, self.html)
        title = found[0] if found else ""
        if not title:
            now_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
            title = f"暂无标题-{now_time}"
        self.title = title
        return title

    def _media_path(self, root, extension):
        """Return ``<root>/<title>/<title><extension>`` with a file-system-safe title."""
        safe_title = self._UNSAFE_PATH_CHARS.sub("_", self.title)
        return os.path.join(root, safe_title, f"{safe_title}{extension}")

    def _download(self, url, path):
        """Stream *url* into *path*, creating the parent directory if needed."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Write to a temporary name first so that an interrupted transfer
        # never leaves a truncated file under the final name.
        partial_path = f"{path}.part"
        with requests.get(url=url, headers=self.headers, stream=True) as response:
            # Fail on HTTP errors instead of saving an error page as media.
            response.raise_for_status()
            with open(partial_path, mode='wb') as out:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    out.write(chunk)
        os.replace(partial_path, path)

    def _fetch_all(self, targets):
        """Download every ``(url, path)`` pair; return True only if all succeeded.

        Failures are printed rather than raised, matching how the spiders
        report every other problem.
        """
        try:
            for url, path in targets:
                self._download(url, path)
        except requests.exceptions.RequestException as e:
            print(f"请求错误: {e}")
            return False
        except Exception as e:
            # Boundary handler: e.g. the target file could not be written.
            print(f"发生错误: {e}")
            return False
        return True

    def video_spider(self):
        """Download the video and audio streams of a short-video page.

        Reads the title and the ``window.__playinfo__`` JSON from the page
        HTML and saves the first video stream as ``.mp4`` and the first audio
        stream as ``.mp3`` under ``process_video/<title>/``.
        """
        try:
            title = self._resolve_title(r'title="(.*?)"')
            print(f"视频标题: {title}")
            # The stream URLs are embedded in the page as a JSON blob.
            info = re.findall(r'window.__playinfo__=(.*?)</script>', self.html)[0]
            dash = json.loads(info)['data']['dash']
            video_url = dash['video'][0]['baseUrl']
            audio_url = dash['audio'][0]['baseUrl']
        except json.JSONDecodeError:
            print("解析JSON时出错")
            return
        except (IndexError, KeyError, TypeError) as e:
            print(f"发生错误: {e}")
            return

        print(f"视频链接: {video_url}")
        print(f"音频链接: {audio_url}")
        targets = [
            (video_url, self._media_path("process_video", ".mp4")),
            (audio_url, self._media_path("process_video", ".mp3")),
        ]
        if self._fetch_all(targets):
            print("视频音频下载完成!")

    def movie_spider(self):
        """Download the video stream of a movie page.

        Reads the ``og:title`` meta tag and the ``video_info`` JSON from the
        page HTML and saves the stream as
        ``process_video/<title>/<title>.mp4``.
        """
        try:
            title = self._resolve_title(r'property="og:title" content="(.*?)"*/?>')
            print("视频标题为:", title)
            video_info = re.findall(r'"video_info":(.*?),"view_info"', self.html)
            json_data = json.loads(video_info[0])
            try:
                # Layout the original author saw for VIP accounts.
                movie_url = json_data["durl"][0]['url']
            except (KeyError, IndexError, TypeError):
                # Layout the original author saw for non-VIP accounts.
                movie_url = json_data["dash"]["video"][0]["base_url"]
        except json.JSONDecodeError:
            print("解析JSON时出错")
            return
        except (IndexError, KeyError, TypeError) as e:
            print(f"发生错误: {e}")
            return

        print("视频链接:", movie_url)
        print("视频下载中~~~")
        if self._fetch_all([(movie_url, self._media_path("process_video", ".mp4"))]):
            print("✅ 视频下载完成!")

    def check_video_type(self):
        """Inspect the page's ``og:type`` meta tag and run the matching spider.

        A type of exactly ``video`` is treated as a short video; anything
        else, including a missing tag, is treated as a movie.
        """
        matches = re.findall(r'property="og:type" content="(video[^"]*)"*/?>', self.html)
        # findall returns a list, so compare its first entry, not the list.
        if matches and matches[0] == "video":
            print("短视频")
            self.video_spider()
        else:
            print("电影")
            self.movie_spider()

    def video_compose(self):
        """Merge the downloaded ``.mp4`` and ``.mp3`` into one file with ffmpeg.

        Requires that a spider has already set ``self.title``; otherwise a
        message is printed and nothing is done. The result is written to
        ``finally_video/<title>/<title>.mp4``.
        """
        if not self.title:
            print("title is None")
            return

        print("开始合并视频...")
        output_path = self._media_path("finally_video", ".mp4")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Pass the arguments as a list so the title is never parsed as part
        # of a command line.
        cmd = [
            "ffmpeg", "-hide_banner",
            "-i", self._media_path("process_video", ".mp4"),
            "-i", self._media_path("process_video", ".mp3"),
            "-c:v", "copy",
            "-c:a", "aac",
            "-strict", "experimental",
            output_path,
        ]
        subprocess.run(cmd)
videoSpider.py

总结
- 登录后才能爬取到高码率的视频,因此get请求需要调用cookie
- 爬取下来的视频可能不支持预览,可能原因是播放器解码失败,选用KMPlayer播放视频
- 现如今只适配了电影和短视频,只保证了测试数据能正常运行,关键的问题还是在 `check_video_type` 这一个函数上面