Scrapy basics

import scrapy
from qiubaiPro.items import QiubaiproItem

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # Prefer xpath for extracting the target content: the framework ships with an
        # integrated xpath interface, callable directly on the response object
        # Targets: each joke's content and its author
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # extract() pulls the values stored in the Selector objects out as a list,
            # so a single value is taken by index:
            # author = div.xpath('./div/a[2]/h2/text()').extract()[0]
            # extract_first() is equivalent to extract()[0]
            author = div.xpath('./div/a[2]/h2/text()').extract_first()
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            # 1. Store the parsed values (author and content) in an item object;
            #    the fields must be declared on the QiubaiproItem class
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # 2. Submit the item to the pipeline; the persistence code lives in
            #    the pipelines file
            yield item
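The spider imports QiubaiproItem and assigns its two fields, so those fields have to be declared on the item class. A minimal sketch of items.py, assuming only the two fields the spider actually fills:

# items.py -- minimal sketch, assuming only the two fields used by the spider
import scrapy

class QiubaiproItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()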

# Pipeline-based persistent storage
class QiubaiproPipeline:
    fp = None

    # Called exactly once per crawl, when the spider starts
    def open_spider(self, spider):
        print('spider started')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    # Receives the item objects submitted by the spider file and persists the
    # page data stored in them
    # item: the received item object
    # Runs once every time the spider submits an item to the pipeline
    def process_item(self, item, spider):
        # Pull the stored values out of the item
        author = item['author']
        content = item['content']
        # Persist them to disk
        self.fp.write(author + ':' + content + '\n\n\n')
        return item

    # Called exactly once, when the spider finishes
    def close_spider(self, spider):
        print('spider finished')
        self.fp.close()
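A pipeline class only runs once it is registered in settings.py; the same applies to the MySQL and redis variants below. A sketch of the relevant setting, assuming the project is named qiubaiPro as in the import at the top:

# settings.py -- enable the pipeline (lower number = higher priority)
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
}

With the pipeline enabled, the crawl is started with: scrapy crawl qiubai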

# Pipeline that stores the data in a MySQL database
import pymysql

class QiubaiproPipeline:
    conn = None
    cursor = None

    def open_spider(self, spider):
        print('spider started')
        # 1. Connect to the database (the database and target table must exist)
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306,
                                    user='root', password='123', db='qiubai')

    def process_item(self, item, spider):
        # 2. Build the sql statement
        sql = 'insert into qiubai values("%s","%s")' % (item['author'], item['content'])
        # Create a cursor object
        self.cursor = self.conn.cursor()
        # 3. Execute the statement and commit the transaction; roll back on error
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('spider finished')
        self.cursor.close()  # close the cursor object
        self.conn.close()    # close the connection object
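Interpolating the values into the SQL string with % leaves the insert open to quoting errors and SQL injection; pymysql can bind the parameters itself. A sketch of a safer variant, using a hypothetical insert_item helper to show just the execute call:

import pymysql

def insert_item(conn, item):
    # Parameterized variant: pymysql escapes the values itself,
    # avoiding quoting errors and SQL injection
    with conn.cursor() as cursor:
        cursor.execute('insert into qiubai values(%s, %s)',
                       (item['author'], item['content']))
    conn.commit()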

# Pipeline that stores the data in redis
import json

import redis

class QiubaiproPipeline:
    conn = None

    def open_spider(self, spider):
        print('spider started')
        # 1. Connect to redis
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        # 2. Build the record; redis-py only accepts bytes/str/numbers as list
        #    values, so serialize the dict to JSON first
        data = json.dumps({
            'author': item['author'],
            'content': item['content'],
        })
        # Push the record onto a list named "data"
        self.conn.lpush('data', data)
        return item
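A quick way to verify the pipeline worked is to read the list back from the Python side; a sketch assuming the JSON serialization used above:

import json
import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
# lrange walks the whole "data" list; each entry is a JSON-encoded record
for raw in conn.lrange('data', 0, -1):
    record = json.loads(raw)
    print(record['author'], ':', record['content'])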
