# Map service scraper (地图服务爬取)
#
# Posted by Rallan on 2025-6-17 15:28:53

import os
import requests
import json
from urllib.parse import quote
import re

# Search endpoint that is POSTed to once per result page.
url = "http://bzdt.ch.mnr.gov.cn/sbsm/supermap/searchPicture.do"

# Request headers sent with every search POST (form-encoded body, XHR marker).
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://bzdt.ch.mnr.gov.cn",
    "Referer": "http://bzdt.ch.mnr.gov.cn/download.html?superclassName=%25E4%25B8%25AD%25E5%259B%25BD%25E5%2585%25A8%25E5%259B%25BE",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
    "X-Requested-With": "XMLHttpRequest",
}

# NOTE(review): hard-coded session cookies, presumably captured from a browser
# session; they likely expire and need refreshing -- confirm before running.
cookies = {
    "JSESSIONID": "2A21F504B0E01929D61553D9090C88DC",
    "HWWAFSESTIME": "1748420564265",
    "HWWAFSESID": "4479e79724b249eda0",
}

# Create the output directory (no error if it already exists).
os.makedirs("map", exist_ok=True)

# Characters that are not allowed in file names (Windows-reserved set).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|]')


def sanitize_filename(name):
    """Return *name* with characters that are illegal in file names removed.

    Strips every occurrence of ``\\ / * ? : " < > |``.

    Args:
        name: The raw name. ``None`` is treated as an empty string and other
            non-string values are converted with ``str()`` so that a missing
            or null JSON field does not raise.

    Returns:
        The cleaned string (possibly empty).
    """
    if name is None:
        return ""
    return _ILLEGAL_FILENAME_CHARS.sub("", str(name))

def _download_file(down_url, file_name):
    """Stream *down_url* to *file_name* without leaving a truncated file behind.

    The body is written to ``<file_name>.part`` and renamed onto *file_name*
    only after the whole response has been written, so an interrupted
    download never looks like a finished archive.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            4xx/5xx status.
        OSError: if the file cannot be written or renamed.
    """
    part_name = f"{file_name}.part"
    try:
        with requests.get(down_url, stream=True, timeout=30) as r:
            r.raise_for_status()  # raise on a 4xx/5xx response
            with open(part_name, "wb") as f:
                # Stream in 1 MiB chunks instead of loading the archive in memory.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(part_name, file_name)
    finally:
        # After a successful rename the .part file no longer exists; on any
        # failure this removes the incomplete download.
        if os.path.exists(part_name):
            os.remove(part_name)


# Query result pages 1..13 of the search API and download every listed archive.
for page_num in range(1, 14):
    data = {
        # NOTE(review): "seachText" is kept exactly as in the original request;
        # presumably it matches the server's parameter name -- do not "fix" it.
        "seachText": "",
        "scale": "",
        "size": "",
        "pageNum": page_num,
        "orderSearch": "id",
        "superclass": "中国全图",
        "subclass": "",
        "smallclass": "",
        "border": "",
        "largeclass": "",
        "mapYear": "",
        "neighbouringCountry": "",
        "provinceColor": "",
        "illustration": "",
    }

    try:
        # timeout: without one, requests can block forever on a stalled server.
        response = requests.post(
            url,
            headers=headers,
            cookies=cookies,
            data=data,
            verify=False,
            timeout=30,
        )
        response.raise_for_status()
        results = response.json()["message"]["result"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # A failed or malformed page should not abort the remaining pages.
        print(f"获取第 {page_num} 页失败: {e}")
        continue

    for item_data in results or []:
        # "or ''" also covers fields that are present but null in the JSON.
        name = str(item_data.get("name") or "")
        size = str(item_data.get("size") or "")
        content = str(item_data.get("content") or "")
        id_ = item_data.get("id", "")

        # Build the local file name from the sanitized name and content fields.
        safe_name = sanitize_filename(name)
        safe_content = sanitize_filename(content)
        file_name = f"map/{safe_name}_{safe_content}.zip"

        # Build the download link; the size field is URL-encoded into the path.
        encoded_size = quote(size)
        down_url = f"https://bzdt-sbsm.obs.cn-north-4.myhuaweicloud.com/prototype/{encoded_size}/{id_}a.zip"

        print(f"正在下载: {down_url}")

        try:
            _download_file(down_url, file_name)
            print(f"已保存至: {file_name}")
        except Exception as e:
            # Per-item boundary: report the failure and continue with the next
            # archive rather than aborting the whole batch.
            print(f"下载失败: {down_url}, 错误: {e}")

# Page: [1]
#
# 我爱it学习's Archiver
#
# Map service scraper (地图服务爬取)