"""Print the ID and text of every ``<a onclick=...>`` link on the page at ``url``.

The page is downloaded, parsed with lxml, and every anchor that carries an
``onclick`` attribute is inspected.  When the attribute contains a call of the
form ``getClassify(<digits>, ...)`` the numeric first argument is printed
together with the link text; otherwise only the link text is printed.
"""
import re

import requests
from lxml import etree

url = "https://www.mohrss.gov.cn/xxgk2020/zcwjk/"

# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 10

# Captures the first (numeric) argument of a ``getClassify(<id>, ...)`` call.
# Compiled once here instead of being looked up on every loop iteration.
CLASSIFY_ID_RE = re.compile(r"getClassify\((\d+),")


def main():
    """Fetch ``url`` and print one line per ``<a>`` element that has ``onclick``.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    # Let requests guess the charset from the body so the text decodes cleanly.
    response.encoding = response.apparent_encoding
    data = etree.HTML(response.text)

    a_tags = data.xpath("//a[@onclick]")
    for a in a_tags:
        # The XPath above only selects anchors that have the attribute, so a
        # single attribute lookup is enough; "" is just a defensive default.
        onclick_attr = a.get("onclick", "")
        match = CLASSIFY_ID_RE.search(onclick_attr)
        # string() concatenates the text of the anchor and all its descendants.
        text = a.xpath("string()")
        if match:
            id_value = match.group(1)
            print(f"ID: {id_value}+{text}")
        else:
            print(f"text: {text}")


if __name__ == "__main__":
    main()
解析:
- 关键代码:
a_tags = data.xpath("//a[@onclick]")
匹配所有带有 onclick 属性的 a 标签(返回的是 a 元素本身,而不是 onclick 属性的值)
- 取到的数据是一个元素列表,遍历时可以用正则表达式(re.search)对每个元素的 onclick 属性值进行匹配
- 获取标签内的文本需要用
a.xpath("string()")
它会返回该标签及其所有子节点拼接后的文本,然后再输出
- 正则匹配成功后需要用
match.group(1)
提取第一个捕获组的值,即 getClassify( 后面的数字 ID;如果没有匹配到,match 为 None,不能调用 group