# Map service scraper: downloads the map archives listed on bzdt.ch.mnr.gov.cn.
import json
import os
import re
from urllib.parse import quote

import requests
# Make sure the output directory exists (no error if it is already there).
os.makedirs("map", exist_ok=True)

# Endpoint the search form is POSTed to.
url = "http://bzdt.ch.mnr.gov.cn/sbsm/supermap/searchPicture.do"

# Cookies sent with the search request.
# NOTE(review): these look like session/WAF values captured from a browser
# session; presumably they expire - confirm before relying on them.
cookies = dict(
    JSESSIONID="2A21F504B0E01929D61553D9090C88DC",
    HWWAFSESTIME="1748420564265",
    HWWAFSESID="4479e79724b249eda0",
)

# HTTP headers sent with the search request (form-encoded body, JSON reply
# accepted, marked as an XMLHttpRequest).
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://bzdt.ch.mnr.gov.cn",
    "Referer": "http://bzdt.ch.mnr.gov.cn/download.html?superclassName=%25E4%25B8%25AD%25E5%259B%25BD%25E5%2585%25A8%25E5%259B%25BE",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
    "X-Requested-With": "XMLHttpRequest",
}
# Characters stripped from file names: the set Windows reserves in file names
# (\ / * ? : " < > |) plus ASCII control characters such as newlines and tabs.
# Compiled once because the function is called for every downloaded record.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|\x00-\x1f]')


def sanitize_filename(name):
    """Return *name* with characters that are unsafe in file names removed.

    Args:
        name: The raw text to clean. ``None`` is treated as an empty string
            and any other non-string value is converted with ``str()`` so
            that missing or numeric JSON fields do not raise.

    Returns:
        The cleaned string; the offending characters are dropped, not
        replaced.
    """
    if name is None:
        return ""
    return _ILLEGAL_FILENAME_CHARS.sub("", str(name))
# Timeout (seconds) applied to every HTTP request so that a stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30

# Listing pages to walk (1..13 inclusive).
# NOTE(review): the upper bound is hard-coded; presumably it was the page
# count of the requested category when the script was written - confirm.
PAGE_NUMBERS = range(1, 14)


def _text(value):
    """Return *value* as a string, mapping ``None`` to an empty string."""
    return "" if value is None else str(value)


def _fetch_page(page_num):
    """Return the list of map records on search-result page *page_num*.

    Raises:
        requests.RequestException: on a network failure or non-2xx status.
        ValueError: if the response body is not valid JSON.
        KeyError, TypeError: if the JSON has no ``message`` -> ``result``.
    """
    data = {
        # NOTE(review): "seachText" is spelled this way in the original
        # request; presumably it is the server's field name - do not
        # "correct" it without checking.
        "seachText": "",
        "scale": "",
        "size": "",
        "pageNum": page_num,
        "orderSearch": "id",
        "superclass": "中国全图",
        "subclass": "",
        "smallclass": "",
        "border": "",
        "largeclass": "",
        "mapYear": "",
        "neighbouringCountry": "",
        "provinceColor": "",
        "illustration": "",
    }
    # NOTE(review): verify=False disables TLS certificate checks. The URL is
    # plain http, so it only matters if the server redirects to https;
    # kept to preserve the original behaviour - consider removing.
    response = requests.post(
        url,
        headers=headers,
        cookies=cookies,
        data=data,
        verify=False,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    results = response.json()["message"]["result"]
    # A null result is treated as an empty page.
    return results or []


def _download(down_url, file_name):
    """Stream *down_url* to *file_name*, never leaving a truncated file.

    The body is written to ``<file_name>.part`` and renamed onto the final
    name only after the whole download succeeded.

    Raises:
        requests.RequestException: on a network failure or non-2xx status.
        OSError: if the file cannot be written or renamed.
    """
    part_name = file_name + ".part"
    try:
        with requests.get(down_url, stream=True, timeout=REQUEST_TIMEOUT) as r:
            r.raise_for_status()  # anything other than a success status aborts
            with open(part_name, "wb") as f:
                # Stream in 1 MiB chunks so the archive is never held in memory.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(part_name, file_name)
    finally:
        # After a successful rename the .part file no longer exists; after a
        # failure this removes the partial download.
        if os.path.exists(part_name):
            try:
                os.remove(part_name)
            except OSError:
                pass  # best-effort cleanup; do not mask the original error


for page_num in PAGE_NUMBERS:
    try:
        results = _fetch_page(page_num)
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # One bad listing page must not abort the remaining pages.
        print(f"获取第 {page_num} 页失败, 错误: {e}")
        continue

    for item_data in results:
        # Fields may be missing or null in the JSON; normalise them to text.
        name = _text(item_data.get("name"))
        size = _text(item_data.get("size"))
        content = _text(item_data.get("content"))
        id_ = _text(item_data.get("id"))

        # Build the local file name.
        safe_name = sanitize_filename(name)
        safe_content = sanitize_filename(content)
        file_name = f"map/{safe_name}_{safe_content}.zip"

        # Build the download link; the size segment is percent-encoded.
        encoded_size = quote(size)
        down_url = f"https://bzdt-sbsm.obs.cn-north-4.myhuaweicloud.com/prototype/{encoded_size}/{id_}a.zip"
        print(f"正在下载: {down_url}")

        try:
            _download(down_url, file_name)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failure and move on to the next record.
            print(f"下载失败: {down_url}, 错误: {e}")
        else:
            print(f"已保存至: {file_name}")
# (Pagination footer left over from the source web page: page [1].)