import re

import requests
from lxml import etree

# Page to scrape; the loop below looks for <a onclick="getClassify(<id>, ...)"> links on it.
url = "https://www.mohrss.gov.cn/xxgk2020/zcwjk/"

# A timeout keeps the script from hanging forever if the server stalls.
response = requests.get(url, timeout=10)
# Use the charset detected from the body so response.text decodes correctly.
response.encoding = response.apparent_encoding
html = response.text
data = etree.HTML(html)

# Compiled once, outside the loop; group 1 captures the first numeric argument.
classify_id_pattern = re.compile(r"getClassify\((\d+),")

# Select every <a> element that carries an onclick attribute.
a_tags = data.xpath("//a[@onclick]")
for a in a_tags:
    # The XPath above guarantees the attribute exists; "" is only a safety default.
    onclick_attr = a.get("onclick", "")
    match = classify_id_pattern.search(onclick_attr)
    if match:
        id_value = match.group(1)
        # `text` has to be assigned before it is printed: string() returns the
        # concatenated text content of the link element.
        text = a.xpath("string()")
        print(f"ID: {id_value}+{text}")
# Take the first <h1> matched by //body/div/div/h1; the [0] raises IndexError
# when nothing matches.
heading_node = self.data.xpath("//body/div/div/h1")[0]
# string(.) returns the node's full text content, including nested elements.
chapter_name = heading_node.xpath("string(.)")
print(chapter_name)
解析:
- 关键代码:
a_tags = data.xpath("//a[@onclick]")
选取所有带有 onclick 属性的 a 标签(返回的是 a 元素本身,而不是属性值)
- 取到的数据是一个元素列表,遍历后可以对每个元素的 onclick 属性值做正则(模糊)匹配
- 要输出标签的文本(text),需要先用
a.xpath("string()")
取出文本内容,再输出
- 模糊匹配后需要用
match.group(1)
提取出值