Python Web Scraping Notes

Recently I got a task: the project lead asked me to put together two data files of national administrative divisions — Excel or SQL, either is fine — one for China and one for South Africa. That threw me at first. China is the easy part: just crawl the administrative division pages on the National Bureau of Statistics site (the 2018 edition, the latest published at the time). But where am I supposed to get South Africa's?

Long story short: I dug through a lot of existing datasets, and they were either incomplete or outdated. Whoever published them must have compiled the data themselves, so why can't I scrape my own? Said and done. Let's start with China's administrative division data.

First, the National Bureau of Statistics site. The address we want to crawl is this one:

https://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/

First, here's the big red rooster ~ China No.1:

(image: map of China)

Alright. Let's figure out how this site is laid out:

(screenshot: layout of the homepage)

Analyzing the homepage source tells us the key point about the layout:

The whole page is laid out with table tags. That means if we want to extract information with BeautifulSoup, we have to be careful: tables are not used only where the provinces are marked in the screenshot. The page contains several tables, so we can't grab the data by selecting tables directly; instead, we select the data rows by their class attribute. The province links live in rows with the class provincetr:

    print('Start scraping provinces...')
    province_url = url + index_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(province_url, headers=headers)
    request.encoding = 'gbk'
    province_html_text = str(request.text)
    soup = BeautifulSoup(province_html_text, "html.parser")
    province_tr_list = soup.select('.provincetr a')
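
To sanity-check the selector logic, here is a tiny self-contained test against a hand-written fragment that mimics the structure of the province page (the fragment is my approximation of the real markup, not a copy of it):

    from bs4 import BeautifulSoup

    sample = ('<table><tr class="provincetr">'
              '<td><a href="11.html">北京市</a></td>'
              '<td><a href="12.html">天津市</a></td>'
              '</tr></table>')
    soup = BeautifulSoup(sample, 'html.parser')
    for a in soup.select('.provincetr a'):
        # the href stem is the 2-digit province prefix; pad it to a 6-digit code
        print(a.attrs['href'].split('.')[0] + '0000', a.text)
    # -> 110000 北京市
    # -> 120000 天津市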

Next, let's look at a typical data page (the city-, county-, and town-level pages all share this format):

The reason for grouping those three pages together is that their layouts turn out to be completely identical. The only difference is the class attribute of the data rows (tr) in the HTML table: citytr, countytr, and towntr respectively. Everything else matches, so one generic method could handle the data scraping for all three pages — a sketch of such a helper follows.
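
For instance, a fetch-and-parse helper shared by all three levels might look like this (a minimal sketch; get_rows and the row_class parameter are my own names, not part of the original script):

    import requests
    from bs4 import BeautifulSoup

    BASE_URL = 'https://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }

    def get_rows(page_href, row_class):
        """Fetch one data page and return its data rows, selected by tr class."""
        response = requests.get(BASE_URL + page_href, headers=HEADERS)
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        # row_class is 'citytr', 'countytr' or 'towntr' depending on the level
        return soup.select('.' + row_class)

Back to the script's actual step-by-step flow. Iterating over the city-level info: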

    city_url = url + province_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(city_url, headers=headers)
    request.encoding = 'gbk'
    city_html_text = str(request.text)
    soup = BeautifulSoup(city_html_text, "html.parser")
    city_tr_list = soup.select('.citytr')
    # iterate over the city rows

Then we loop over those rows, writing each city and descending into its districts:

    for city_tr in city_tr_list:
        city_a_info = city_tr.select('a')
        if not city_a_info:
            # some rows carry no links; skip them to avoid an IndexError
            continue
        file = open('china_data/area.sql', 'a+', encoding='utf-8')
        city_href = city_a_info[0].attrs['href']
        city_code = city_a_info[0].text[:6]
        city_name = city_a_info[1].text
        city_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(city_code) + '", "' + str(city_name) + '", "' + str(province_code) + '", "' + str(level) + '");\n'
        file.write(city_info)
        file.close()
        print('wrote city:', city_info)
        # district level
        get_area(city_href, city_code)
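
To make the parent_code chain concrete, the generated file ends up with lines along these lines (using Beijing's real codes from the 2018 tables):

    INSERT INTO area (code, name, parent_code, level) VALUES ("110000", "北京市", "", "1");
    INSERT INTO area (code, name, parent_code, level) VALUES ("110100", "市辖区", "110000", "2");
    INSERT INTO area (code, name, parent_code, level) VALUES ("110101", "东城区", "110100", "3");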

The data is saved in SQL form, so it only needs to be imported into the database. There is one INSERT template per level, each row chained to its parent through parent_code:

province_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(province_code) + '", "' + str(province_name) + '", "' + str(parent_code) + '", "' + str(level) + '");\n'

city_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(city_code) + '", "' + str(city_name) + '", "' + str(province_code) + '", "' + str(level) + '");\n'

area_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(area_code) + '", "' + str(area_name) + '", "' + str(city_code) + '", "' + str(level) + '");\n'
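
One thing the script does not emit is the table itself. Below is a minimal loader sketch: the schema simply mirrors the INSERT columns, and the connection parameters, column types, and the pymysql dependency are my assumptions rather than part of the original setup:

    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='******',
                           database='test', charset='utf8mb4')
    with conn.cursor() as cursor:
        # create the target table matching the generated INSERT statements
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS area ('
            '  code VARCHAR(12) NOT NULL,'
            '  name VARCHAR(64) NOT NULL,'
            '  parent_code VARCHAR(12),'
            '  level VARCHAR(2))'
        )
        with open('china_data/area.sql', encoding='utf-8') as f:
            for statement in f:  # the script writes one INSERT per line
                if statement.strip():
                    cursor.execute(statement)
    conn.commit()
    conn.close()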

Finally, the complete code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
爬取国家统计局最新地址库
省市区三级(一张表)
author: icefire
time: 2019-03-13
"""

import requests
from bs4 import BeautifulSoup
import os


def get_province(index_href):
    """抓取省份信息"""
    print('开始抓取省份信息……')
    province_url = url + index_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(province_url, headers=headers)
    request.encoding = 'gbk'
    province_html_text = str(request.text)
    soup = BeautifulSoup(province_html_text, "html.parser")
    province_tr_list = soup.select('.provincetr a')
    # iterate over the province list
    level = '1'
    parent_code = ''
    for province_tr in province_tr_list:
        if province_tr:
            file = open('china_data/area.sql', 'a+', encoding='utf-8')
            province_href = province_tr.attrs['href']
            province_no = province_href.split('.')[0]
            province_code = province_no + '0000'
            province_name = province_tr.text
            province_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(province_code) + '", "' + str(province_name) + '", "' + str(parent_code) + '", "' + str(level) + '");\n'
            file.write(province_info)
            file.close()
            print('wrote province:', province_info)
            # city level
            get_city(province_href, province_code)
    print('Finished scraping provinces!')


def get_city(province_href, province_code):
    """抓取市级城市信息"""
    print('开始抓取市级信息')
    city_url = url + province_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(city_url, headers=headers)
    request.encoding = 'gbk'
    city_html_text = str(request.text)
    soup = BeautifulSoup(city_html_text, "html.parser")
    city_tr_list = soup.select('.citytr')
    # iterate over the city rows
    level = '2'
    for city_tr in city_tr_list:
        city_a_info = city_tr.select('a')
        if not city_a_info:
            # some rows carry no links; skip them to avoid an IndexError
            continue
        file = open('china_data/area.sql', 'a+', encoding='utf-8')
        city_href = city_a_info[0].attrs['href']
        city_code = city_a_info[0].text[:6]
        city_name = city_a_info[1].text
        city_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(city_code) + '", "' + str(city_name) + '", "' + str(province_code) + '", "' + str(level) + '");\n'
        file.write(city_info)
        file.close()
        print('wrote city:', city_info)
        # district level
        get_area(city_href, city_code)
    print('Finished scraping cities!')


def get_area(city_href, city_code):
    """抓取区级信息"""
    print('开始抓取区级信息')
    area_url = url + city_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(area_url, headers=headers)
    request.encoding = 'gbk'
    area_html_text = str(request.text)
    soup = BeautifulSoup(area_html_text, "html.parser")
    area_tr_list = soup.select('.countytr')
    # iterate over the district rows
    file = open('china_data/area.sql', 'a+', encoding='utf-8')
    level = '3'
    for area_tr in area_tr_list:
        area_a_info = area_tr.select('td')
        if area_a_info:
            area_code = area_a_info[0].text[:6]
            area_name = area_a_info[1].text
            area_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(area_code) + '", "' + str(area_name) + '", "' + str(city_code) + '", "' + str(level) + '");\n'
            file.write(area_info)
            print('wrote district:', area_info)
    print('Finished scraping districts!')
    file.close()


# program entry point
if __name__ == "__main__":
    url = 'https://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'
    # create the output directory if it doesn't exist
    mysql_folder = 'china_data/'
    if not os.path.exists(mysql_folder):
        os.makedirs(mysql_folder)
    else:
        # clear any previous output
        city_file = open('china_data/area.sql', 'w', encoding='utf-8')
        city_file.write('')
        city_file.close()
    get_province('index.html')
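
Since the requirement allowed either Excel or SQL, the generated file can also be converted into a spreadsheet in a few lines (my own sketch; it assumes pandas and openpyxl are installed):

    import re

    import pandas as pd

    rows = []
    pattern = re.compile(r'VALUES \("(.*?)", "(.*?)", "(.*?)", "(.*?)"\)')
    with open('china_data/area.sql', encoding='utf-8') as f:
        for line in f:
            match = pattern.search(line)
            if match:
                rows.append(match.groups())

    df = pd.DataFrame(rows, columns=['code', 'name', 'parent_code', 'level'])
    df.to_excel('china_data/area.xlsx', index=False)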


OK, done. As for South Africa's administrative divisions, I'll keep digging and post an update once I find a solution.

Installing the libraries using the Douban mirror:

pip install requests -i "https://pypi.doubanio.com/simple/"
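
BeautifulSoup comes from the beautifulsoup4 package and installs the same way:

pip install beautifulsoup4 -i "https://pypi.doubanio.com/simple/"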


A physician cannot cure himself; a man cannot ferry himself across.