Python really does open up a lot of possibilities once you have learned it. Below, 传智播客 shares a case study on using Python to crawl Zhihu user data.
Install the Scrapy crawler framework
How to install Python and the Scrapy framework is not covered here; please look it up online.
Initialize the project
Once Scrapy is installed, run scrapy startproject myspider
You will then see a myspider folder with the following directory structure:
scrapy.cfg
myspider/
    items.py
    pipelines.py
    settings.py
    __init__.py
    spiders/
        __init__.py
Write the spider
Create a new file users.py in the spiders directory:
# -*- coding: utf-8 -*-
import scrapy
import os
import time
from myspider.items import UserItem
from myspider.myconfig import UsersConfig  # crawler configuration
class UsersSpider(scrapy.Spider):
    name = 'users'
    domain = 'https://www.zhihu.com'
    login_url = 'https://www.zhihu.com/login/email'
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36"
    }
    def __init__(self, url = None):
        self.user_url = url
    def start_requests(self):
        yield scrapy.Request(
            url = self.domain,
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': 1
            },
            callback = self.request_captcha
        )
    def request_captcha(self, response):
        # extract the _xsrf token from the login page
        _xsrf = response.css('input[name="_xsrf"]::attr(value)').extract()[0]
        # captcha URL; the timestamp is only a cache buster
        captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + str(int(time.time() * 1000))
        # download the captcha image
        yield scrapy.Request(
            url = captcha_url,
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                '_xsrf': _xsrf
            },
            callback = self.download_captcha
        )
    def download_captcha(self, response):
        # save the captcha image to disk
        with open('captcha.gif', 'wb') as fp:
            fp.write(response.body)
        # open the captcha with an image viewer
        # ('start' is Windows-only; on macOS use 'open', on Linux 'xdg-open')
        os.system('start captcha.gif')
        # type the captcha in by hand
        print 'Please enter captcha: '
        captcha = raw_input()
        yield scrapy.FormRequest(
            url = self.login_url,
            headers = self.headers,
            formdata = {
                'email': UsersConfig['email'],
                'password': UsersConfig['password'],
                '_xsrf': response.meta['_xsrf'],
                'remember_me': 'true',
                'captcha': captcha
            },
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar']
            },
            callback = self.request_zhihu
        )
    def request_zhihu(self, response):
        yield scrapy.Request(
            url = self.user_url + '/about',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_item,
            dont_filter = True
        )
        yield scrapy.Request(
            url = self.user_url + '/followees',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_start,
            dont_filter = True
        )
        yield scrapy.Request(
            url = self.user_url + '/followers',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_start,
            dont_filter = True
        )
    def user_start(self, response):
        sel_root = response.xpath('//h2[@class="zm-list-content-title"]')
        # skip users whose followee/follower list is empty
        if len(sel_root):
            for sel in sel_root:
                people_url = sel.xpath('a/@href').extract()[0]
                yield scrapy.Request(
                    url = people_url + '/about',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_item,
                    dont_filter = True
                )
                yield scrapy.Request(
                    url = people_url + '/followees',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_start,
                    dont_filter = True
                )
                yield scrapy.Request(
                    url = people_url + '/followers',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_start,
                    dont_filter = True
                )
    def user_item(self, response):
        def value(list):
            return list[0] if len(list) else ''
        sel = response.xpath('//div[@class="zm-profile-header ProfileCard"]')
        item = UserItem()
        item['url'] = response.url[:-6]
        item['name'] = sel.xpath('//a[@class="name"]/text()').extract()[0].encode('utf-8')
        item['bio'] = value(sel.xpath('//span[@class="bio"]/@title').extract()).encode('utf-8')
        item['location'] = value(sel.xpath('//span[contains(@class, "location")]/@title').extract()).encode('utf-8')
        item['business'] = value(sel.xpath('//span[contains(@class, "business")]/@title').extract()).encode('utf-8')
        item['gender'] = 0 if sel.xpath('//i[contains(@class, "icon-profile-female")]') else 1
        item['avatar'] = value(sel.xpath('//img[@class="Avatar Avatar--l"]/@src').extract())
        item['education'] = value(sel.xpath('//span[contains(@class, "education")]/@title').extract()).encode('utf-8')
        item['major'] = value(sel.xpath('//span[contains(@class, "education-extra")]/@title').extract()).encode('utf-8')
        item['employment'] = value(sel.xpath('//span[contains(@class, "employment")]/@title').extract()).encode('utf-8')
        item['position'] = value(sel.xpath('//span[contains(@class, "position")]/@title').extract()).encode('utf-8')
        item['content'] = value(sel.xpath('//span[@class="content"]/text()').extract()).strip().encode('utf-8')
        item['ask'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[2]/span[@class="num"]/text()').extract()[0])
        item['answer'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[3]/span[@class="num"]/text()').extract()[0])
        item['agree'] = int(sel.xpath('//span[@class="zm-profile-header-user-agree"]/strong/text()').extract()[0])
        item['thanks'] = int(sel.xpath('//span[@class="zm-profile-header-user-thanks"]/strong/text()').extract()[0])
        yield item
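A note on maintenance: the XPath and CSS selectors above are tied to the Zhihu page markup at the time of writing, so if the spider stops extracting data they are the first thing to re-check. Scrapy's interactive shell is convenient for that; a quick sketch (the profile URL is only a placeholder, and the page may require a logged-in session):

# Start the shell with:  scrapy shell "https://www.zhihu.com/people/<some-user>/about"
# then try the selectors from user_item() interactively, for example:
response.xpath('//a[@class="name"]/text()').extract()
response.xpath('//span[contains(@class, "location")]/@title').extract()
response.xpath('//span[@class="zm-profile-header-user-agree"]/strong/text()').extract()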
Add a crawler configuration file
Create myconfig.py inside the myspider package, add the following content, and fill in your own details in the corresponding places:
# -*- coding: utf-8 -*-
UsersConfig = {
    # proxy (leave empty if you do not use one)
    'proxy': '',
    # Zhihu login email and password
    'email': 'your email',
    'password': 'your password',
}
DbConfig = {
    # MySQL connection settings
    'user': 'db user',
    'passwd': 'db password',
    'db': 'db name',
    'host': 'db host',
}
Modify items.py
# -*- coding: utf-8 -*-
import scrapy

class UserItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    name = scrapy.Field()
    bio = scrapy.Field()
    location = scrapy.Field()
    business = scrapy.Field()
    gender = scrapy.Field()
    avatar = scrapy.Field()
    education = scrapy.Field()
    major = scrapy.Field()
    employment = scrapy.Field()
    position = scrapy.Field()
    content = scrapy.Field()
    ask = scrapy.Field()
    answer = scrapy.Field()
    agree = scrapy.Field()
    thanks = scrapy.Field()
Store the user data in MySQL
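The pipeline shown below inserts rows into a users table, which has to exist before the first crawl. Here is a minimal sketch of a one-off script that creates it by reusing DbConfig; the column types and lengths are assumptions, adjust them to your needs (url is used as the primary key so that INSERT IGNORE skips users that were already saved):

# -*- coding: utf-8 -*-
# Hypothetical one-off helper: creates the users table the pipeline expects.
import MySQLdb
from myspider.myconfig import DbConfig

conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'],
                       db = DbConfig['db'], host = DbConfig['host'], charset = 'utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS users (
        url        VARCHAR(255) NOT NULL PRIMARY KEY,
        name       VARCHAR(255),
        bio        VARCHAR(255),
        location   VARCHAR(255),
        business   VARCHAR(255),
        gender     TINYINT,
        avatar     VARCHAR(255),
        education  VARCHAR(255),
        major      VARCHAR(255),
        employment VARCHAR(255),
        position   VARCHAR(255),
        content    TEXT,
        ask        INT,
        answer     INT,
        agree      INT,
        thanks     INT,
        create_at  DATETIME
    ) DEFAULT CHARSET = utf8
""")
conn.commit()
conn.close()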
Modify pipelines.py
# -*- coding: utf-8 -*-
import MySQLdb
import datetime
from myspider.myconfig import DbConfig

class UserPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'], db = DbConfig['db'], host = DbConfig['host'], charset = 'utf8', use_unicode = True)
        self.cursor = self.conn.cursor()
        # optionally empty the table before a fresh crawl
        # self.cursor.execute('truncate table users;')
        # self.conn.commit()
    def process_item(self, item, spider):
        curTime = datetime.datetime.now()
        try:
            self.cursor.execute(
                """INSERT IGNORE INTO users (url, name, bio, location, business, gender, avatar, education, major, employment, position, content, ask, answer, agree, thanks, create_at)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (
                    item['url'],
                    item['name'],
                    item['bio'],
                    item['location'],
                    item['business'],
                    item['gender'],
                    item['avatar'],
                    item['education'],
                    item['major'],
                    item['employment'],
                    item['position'],
                    item['content'],
                    item['ask'],
                    item['answer'],
                    item['agree'],
                    item['thanks'],
                    curTime
                )
            )
            self.conn.commit()
        except MySQLdb.Error as e:
            print 'Error %d %s' % (e.args[0], e.args[1])
        return item
Modify settings.py
Find ITEM_PIPELINES and change it to:
ITEM_PIPELINES = {
    'myspider.pipelines.UserPipeline': 300,
}
At the end of the file, add the following line to limit the crawl depth:
DEPTH_LIMIT=10
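Two more settings are worth knowing about, although the tutorial does not strictly need them: the cookiejar meta key used by the spider relies on Scrapy's cookie middleware (enabled by default), and a download delay keeps the crawl polite. A sketch of what could additionally be appended to settings.py:

# Optional extras (not part of the original tutorial)
COOKIES_ENABLED = True   # the 'cookiejar' meta key needs the cookies middleware, on by default
DOWNLOAD_DELAY = 1       # wait a second between requests to stay polite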
Crawl the Zhihu user data
Make sure MySQL is running, then open a terminal in the project root
and run scrapy crawl users -a url=https://www.zhihu.com/people/,
where the URL is the profile of the first (seed) user; the spider then expands outward through that user's followees and followers, and theirs in turn.
Next, the spider downloads a captcha image. If it does not open automatically, open captcha.gif in the project root yourself and type the captcha into the terminal.
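Note that os.system('start captcha.gif') in download_captcha only opens a viewer on Windows. A small, hypothetical helper that could be swapped in for other platforms:

# Hypothetical cross-platform replacement for os.system('start captcha.gif')
import os
import subprocess
import sys

def open_captcha(path = 'captcha.gif'):
    if sys.platform.startswith('win'):
        os.startfile(path)                   # Windows
    elif sys.platform == 'darwin':
        subprocess.call(['open', path])      # macOS
    else:
        subprocess.call(['xdg-open', path])  # most Linux desktops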
The crawl then starts collecting data.