Rallan 发表于 2025-6-17 15:28:53

地图服务爬取

import os
import requests
import json
from urllib.parse import quote
import re

# Request headers sent with every search call. They imitate a browser-issued
# XMLHttpRequest (desktop Edge user agent, form-encoded body, same-site
# Origin/Referer). The Referer query string is the doubly percent-encoded
# form of the "中国全图" category name.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://bzdt.ch.mnr.gov.cn",
    "Referer": "http://bzdt.ch.mnr.gov.cn/download.html?superclassName=%25E4%25B8%25AD%25E5%259B%25BD%25E5%2585%25A8%25E5%259B%25BE",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
    "X-Requested-With": "XMLHttpRequest"
}
# Hard-coded cookie values sent with every search call.
# NOTE(review): these look like values copied from one browser session
# (a servlet session id plus what are presumably WAF session cookies); they
# will likely expire and need refreshing - confirm before relying on them.
cookies = {
    "JSESSIONID": "2A21F504B0E01929D61553D9090C88DC",
    "HWWAFSESTIME": "1748420564265",
    "HWWAFSESID": "4479e79724b249eda0"
}
# Search endpoint that is POSTed to once per result page.
url = "http://bzdt.ch.mnr.gov.cn/sbsm/supermap/searchPicture.do"

# Create the output directory (relative to the current working directory);
# exist_ok=True makes re-runs safe when it already exists.
os.makedirs("map", exist_ok=True)

def sanitize_filename(name):
    """Return *name* with characters that are illegal in file names removed.

    The characters dropped are backslash, slash, asterisk, question mark,
    colon, double quote, angle brackets and pipe. Every other character,
    including whitespace and non-ASCII text, is kept unchanged.
    """
    illegal_chars = re.compile(r'[\\/*?:"<>|]')
    cleaned = illegal_chars.sub("", name)
    return cleaned

# Walk result pages 1..13 of the search endpoint and download the archive of
# every map listed on each page into the "map" directory.
for page_num in range(1, 14):
    # Form fields of the search request; only the page number varies.
    data = {
        "seachText": "",
        "scale": "",
        "size": "",
        "pageNum": page_num,
        "orderSearch": "id",
        "superclass": "中国全图",
        "subclass": "",
        "smallclass": "",
        "border": "",
        "largeclass": "",
        "mapYear": "",
        "neighbouringCountry": "",
        "provinceColor": "",
        "illustration": "",
    }

    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        # It has no effect on this plain-http URL but would apply if the
        # server redirected to https - confirm it is really wanted.
        response = requests.post(
            url,
            headers=headers,
            cookies=cookies,
            data=data,
            verify=False,
            timeout=30,  # without a timeout a stalled server hangs the run
        )
        response.raise_for_status()
        result_data = json.loads(response.text)
        # A null "result" is treated as an empty page.
        results = result_data["message"]["result"] or []
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # One bad page (network error, non-JSON reply, unexpected layout)
        # should not abort the remaining pages.
        print(f"获取第 {page_num} 页失败: {e}")
        continue

    for item_data in results:
        # Coerce to str so null / non-string fields cannot break the
        # file-name cleaning or URL quoting below.
        name = str(item_data.get("name") or "")
        size = str(item_data.get("size") or "")
        content = str(item_data.get("content") or "")
        id_ = item_data.get("id", "")

        # Build the local file name.
        safe_name = sanitize_filename(name)
        safe_content = sanitize_filename(content)
        file_name = f"map/{safe_name}_{safe_content}.zip"
        # Download into a temporary sibling so an interrupted transfer never
        # leaves a truncated archive under the final name.
        part_name = f"{file_name}.part"

        # Build the download link.
        encoded_size = quote(size)
        down_url = f"https://bzdt-sbsm.obs.cn-north-4.myhuaweicloud.com/prototype/{encoded_size}/{id_}a.zip"

        print(f"正在下载: {down_url}")

        try:
            with requests.get(down_url, stream=True, timeout=30) as r:
                r.raise_for_status()  # raise on 4xx/5xx responses
                with open(part_name, "wb") as f:
                    # Stream the body in 1 MiB chunks.
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            # Promote the finished download to its final name.
            os.replace(part_name, file_name)
            print(f"已保存至: {file_name}")
        except Exception as e:
            # Best-effort batch job: report the failure and move on to the
            # next item instead of aborting the whole run.
            print(f"下载失败: {down_url}, 错误: {e}")
            if os.path.exists(part_name):
                os.remove(part_name)
                  





页: [1]
查看完整版本: 地图服务爬取