解析JS渲染页面


import requests
from lxml import etree
import re

# Fetch the page at `url`, then for every <a> element that has an onclick
# attribute print the numeric first argument of its getClassify(...) call
# together with the link's text.
url = "https://www.mohrss.gov.cn/xxgk2020/zcwjk/"
# A timeout keeps the script from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)
# Decode the body with the encoding requests detects from the content itself.
response.encoding = response.apparent_encoding
html = response.text
data = etree.HTML(html)
# Compiled once outside the loop; group 1 is the digits right after "getClassify(".
classify_id_pattern = re.compile(r"getClassify\((\d+),")
# The predicate [@onclick] selects only <a> elements carrying that attribute.
a_tags = data.xpath("//a[@onclick]")
for a in a_tags:
    onclick_attr = a.get("onclick", "")
    match = classify_id_pattern.search(onclick_attr)
    if match:
        id_value = match.group(1)
        # The original printed an undefined name `text` (NameError on the first
        # match); string() yields the element's text including nested tags.
        text = a.xpath("string()")
        print(f"ID: {id_value}+{text}")

模糊匹配


# Print the text of the first <h1> found at //body/div/div.
# `self.data` is presumably an lxml tree parsed elsewhere (not shown) — TODO confirm.
# NOTE(review): the [0] index raises IndexError when the XPath matches nothing.
chapter_xpath = self.data.xpath("//body/div/div/h1")[0]
# string(.) evaluates to the element's text content, including nested elements.
chapter_name = chapter_xpath.xpath("string(.)")
print(chapter_name)

直接提取值

解析：

关键代码：a_tags = data.xpath("//a[@onclick]") 匹配所有带有 onclick 属性的 a 标签（返回的是 a 元素本身，而不是属性值）

取到的数据是一个列表，遍历列表后可以用正则表达式对每个 a 标签的 onclick 属性值做模糊匹配

输出 a 标签的文本时，需要先用 text = a.xpath("string()") 取出文本，再打印

模糊匹配后需要用match.group(1) 提取出值