Crawling the Qidian Novel Site (Part 2): Code Design

Abstract:
Install Scrapy and create a "novel" project, then write five spiders that crawl Qidian's novel categories, novel lists, chapter lists, work details, and chapter content, storing the results in MongoDB and queuing URLs between spiders through Redis.

1. Install
pip install Scrapy
# be sure to run the DOS window as administrator
conda install scrapy

2. Create the project
scrapy startproject novel
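
The startproject command generates the standard Scrapy skeleton; the spider files written in the steps below all go into the spiders/ directory. Roughly (the exact set of generated files varies slightly with the Scrapy version):

novel/
    scrapy.cfg
    novel/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            qidianClass4.py
            qidianNovel.py
            qidianNovelChapterInfo.py
            qidianNovelWorksInfo.py
            qidianNovelChapterContent.py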

3. Create qidianClass4.py to crawl the first-level and second-level novel categories (names and links) and store them in the corresponding MongoDB collection and Redis list.

import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from bson.objectid import ObjectId
import pymongo
import redis

# MongoDB: database "novel", collection "novelclass" holds the category tree
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelclass

# Redis: the list "novelurl" is the work queue for the next spider
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianClassSpider(scrapy.Spider):
    name = "qidianClass4"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    start_urls = [
        "https://www.qidian.com/all",
    ]

    # parse() is the callback Scrapy invokes for every downloaded page
    def parse(self, response):

        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="work-filter type-filter"]/ul[@type="category"]/li[@class=""]/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            classUrl = 'https:' + classUrl[0]
            print(className[0])
            print(classUrl)
            classid = self.insertMongo(className[0], None)  # top-level category: pid is None
            # bind this category's _id into the callback so sub-categories know their parent
            request = Request(classUrl, callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))
            yield request
            print("======================")
    def parse_subClass(self, response, pid):
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="sub-type"]/dl[@class=""]/dd[@class=""]/a')
        for secItem in hxsObj:
            className2 = secItem.select('text()').extract()
            classUrl2 = secItem.select('@href').extract()
            print(className2)
            print('----------------------------')
            classUrl2 = 'https:' + classUrl2[0]
            print(classUrl2)
            classid = self.insertMongo(className2[0], ObjectId(pid))  # parent = first-level category _id
            self.pushRedis(classid, pid, classUrl2)                   # queue the sub-category URL for the next spider

    def insertMongo(self, classname, pid):
        # insert() is the legacy pymongo API; it returns the new document's _id
        classid = collection.insert({'classname': classname, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, url):
        # queue entry format: "classid,pid,url"
        novelurl = '%s,%s,%s' % (classid, pid, url)
        r.lpush('novelurl', novelurl)
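
After this spider has run, every first-level category is a document in the novelclass collection and every second-level category URL has been queued for the next spider. A minimal read-back sketch (not part of the original post) to confirm both stores:

import pymongo
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

for doc in db.novelclass.find().limit(5):     # a few category documents
    print(doc['classname'], doc['pid'])
for raw in r.lrange('novelurl', 0, 4):        # a few queued "classid,pid,url" entries
    print(str(raw, encoding="utf-8"))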

4. Create qidianNovel.py to crawl novel names and links and store them in the corresponding MongoDB collection and Redis list.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
import pymongo
import redis

# MongoDB: collection "novelname" holds one document per novel
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelname

# Redis: read category URLs from "novelurl", push novel URLs to "novelnameurl"
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovel"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl

    def __init__(self):
        # read the category URLs queued by qidianClass4 from the Redis list "novelurl"
        start_urls = []
        urlList = r.lrange('novelurl', 0, -1)
        ii = 0
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
            # ii += 1
            # if ii > 3:
            #     break
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        num = classInfo['num']
        if num > 3:  # follow at most a few pages per category
            return None
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="book-mid-info"]/h4/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            classUrl = 'https:' + classUrl[0]
            print(className[0])
            print(classUrl)
            classid = self.insertMongo(className[0], objectid)  # parent = category _id (kept as a string)
            self.pushRedis(classid, objectid, classUrl)

        # follow the pagination link and register the next page under the same category info
        nextPage = self.nextUrl(response)
        if nextPage:
            classInfo['num'] += 1
            self.dict[nextPage] = classInfo
            yield Request(nextPage, callback=self.parse)
        print('--------end--------------')
# ---------------------------------------------------------------------------------------------------------------
    # extract the "next page" link from the pagination bar; returns None on the last page
    def nextUrl(self, response):
        hxs = HtmlXPathSelector(response)
        nextPage = hxs.select('//a[@class="lbf-pagination-next "]')
        if len(nextPage) == 1:
            nextPage = nextPage.select('@href').extract()
            nextPage = "https:" + nextPage[0]
            print('==============' + nextPage + '====================')
            return nextPage


    def insertMongo(self, className, pid):
        classid = collection.insert({'classname': className, 'pid': pid})
        return classid


    def pushRedis(self, classid, pid, classUrl):
        # queue entry format: "classid,pid,classUrl," (note the trailing comma)
        novelnameurl = '%s,%s,%s,' % (classid, pid, classUrl)
        r.lpush('novelnameurl', novelnameurl)
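
Because of that trailing comma, each "novelnameurl" entry splits into four fields with the last one empty; the downstream spiders simply ignore it. A quick illustration (the id strings and URL below are made-up placeholders, not real data):

entry = b"59f9d5b1a2c34d0b8c000001,59f9d5b1a2c34d0b8c000000,https://book.qidian.com/info/0000000000,"
parts = str(entry, encoding="utf-8").split(',')
print(parts)  # ['<classid>', '<pid>', 'https://book.qidian.com/info/0000000000', '']
classid, pid, url = parts[0], parts[1], parts[2]  # exactly what the next spiders do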

5. Create qidianNovelChapterInfo.py to crawl the chapter names and links under each novel and store them in the corresponding MongoDB collection and Redis list.

# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
import redis

# MongoDB: collection "novelChapterInfo" holds one document per chapter
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelChapterInfo

# Redis: read novel URLs from "novelnameurl", push chapter URLs to "novelChapterUrl"
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelChapterInfo"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl

    def __init__(self):
        # read the novel URLs queued by qidianNovel from the Redis list "novelnameurl"
        start_urls = []
        urlList = r.lrange('novelnameurl', 0, -1)
        ii = 0
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
            # ii += 1
            # if ii > 1:
            #     break
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        # num = classInfo['num']
        # if num > 3:
        #     return None
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        # each chapter link sits in the catalog list: <ul class="cf"><li><a href="...">title</a>
        novelChapters = selector.xpath('//ul[@class="cf"]/li/a')
        for item in novelChapters:
            novelChapter = item.text
            print(novelChapter)
            novelChapterUrl = 'https:' + item.get('href')
            print(novelChapterUrl)

            classid = self.insertMongo(novelChapter, objectid)  # pid = the novel's _id (kept as a string)
            self.pushRedis(classid, objectid, novelChapterUrl)

    def insertMongo(self, novelChapter, pid):
        classid = collection.insert({'novelChapter': novelChapter, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, novelChapterUrl):
        # queue entry format: "classid,pid,chapterUrl"
        entry = '%s,%s,%s' % (classid, pid, novelChapterUrl)
        r.lpush('novelChapterUrl', entry)
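
The chapter documents written here point back to their novel through pid, and because that id travels through Redis as text it is stored as a plain string rather than an ObjectId. A small sketch (not from the original post) that lists one novel's chapters therefore matches on str(novel['_id']):

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
novel = db.novelname.find_one()  # any novel document written by qidianNovel
if novel is not None:
    print(novel['classname'])
    for chap in db.novelChapterInfo.find({'pid': str(novel['_id'])}):
        print(chap['novelChapter'])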

6. Create qidianNovelWorksInfo.py to crawl each novel's basic information and update it into the existing novel-name collection (novelname) of the MongoDB novel database.

# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId
import redis

# MongoDB: the existing "novelname" collection is updated with the work details
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelname

# Redis: read the novel URLs queued in "novelnameurl"
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelWorksInfo"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl

    def __init__(self):
        # read the novel URLs queued by qidianNovel from the Redis list "novelnameurl"
        start_urls = []
        urlList = r.lrange('novelnameurl', 0, -1)
        ii = 0
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
            # ii += 1
            # if ii > 5:
            #     break
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)
        pid = classInfo['pid']
        # num = classInfo['num']
        # if num > 3:
        #     return None
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        # author, title, status and categories come from the book-info header
        workName = selector.xpath('//div[@class="book-info "]/h1/span/a[@class="writer"]/text()')
        novelName = selector.xpath('//div[@class="book-info "]/h1/em/text()')
        novelState = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/span[@class="blue"]/text()')
        novelClass = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/a[@class="red"]/text()')
        objClass = novelClass[0]  # first-level category
        sonClass = novelClass[1]  # second-level category
        print("Novel: " + novelName[0])
        print("Author: " + workName[0])
        print("Status: " + novelState[0])
        print("Category: " + objClass)
        print("Sub-category: " + sonClass)

        # update the novel document created by qidianNovel with the work details
        db.novelname.update({"_id": objectid2}, {"$set": {'workName': workName, 'novelName': novelName, 'novelState': novelState, 'objClass': objClass, 'sonClass': sonClass}})


        print('--------end--------------')
# ---------------------------------------------------------------------------------------------------------------

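
Note that xpath() returns lists, so the update above stores workName, novelName and novelState as one-element lists. If scalar values are preferred in the novelname documents, a drop-in variant of that update line (a sketch, not the author's code) would be:

        db.novelname.update(
            {"_id": objectid2},
            {"$set": {
                'workName': workName[0],  # author name as a plain string
                'novelName': novelName[0],
                'novelState': novelState[0],
                'objClass': objClass,
                'sonClass': sonClass,
            }},
        )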

7. Create qidianNovelChapterContent.py to crawl the chapter content and update it into the existing chapter collection (novelChapterInfo) of the MongoDB novel database.

# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId
import redis

# MongoDB: the existing "novelChapterInfo" collection is updated with the chapter text
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelChapterInfo

# Redis: read the chapter URLs queued in "novelChapterUrl"
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    name = "qidianNovelChapterContent"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl

    def __init__(self):
        # read the chapter URLs queued by qidianNovelChapterInfo from the Redis list "novelChapterUrl"
        start_urls = []
        urlList = r.lrange('novelChapterUrl', 0, -1)
        ii = 0
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
            # ii += 1
            # if ii > 10:
            #     break
        # print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)  # _id of the chapter document written by qidianNovelChapterInfo
        pid = classInfo['pid']
        num = classInfo['num']
        ii = ""
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        # every paragraph of the chapter body is a <p> inside the read-content div
        novelChaptersContents = selector.xpath('//div[@class="read-content j_readContent"]/p')
        for item in novelChaptersContents:
            novelChaptersContent = item.text
            if novelChaptersContent:
                ii = ii + novelChaptersContent  # append paragraphs in document order
        # write the assembled text back onto the chapter document
        db.novelChapterInfo.update({"_id": objectid2}, {"$set": {'novelChaptersContent': ii}})
        print('------------------------------------------------------')

# ---------------------------------------------------------------------------------------------------------------
    # unused draft: follow the "next chapter" link instead of driving everything from Redis
    # def nextChapter(self, response):
    #     hxs = HtmlXPathSelector(response)
    #     nextChapter = hxs.select('//div[@class="chapter-control dib-wrap"]/a[@id="j_chapterNext"]')
    #     if len(nextChapter) == 1:
    #         nextChapter = nextChapter.select('@href').extract()
    #         nextChapter = "https:" + nextChapter[0]
    #         print('==============' + nextChapter + '====================')
    #         return nextChapter
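
To spot-check the result of this final step, the assembled text can be read straight back out of novelChapterInfo (again a small sketch outside the original post):

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
chap = db.novelChapterInfo.find_one({'novelChaptersContent': {'$exists': True}})
if chap is not None:
    print(chap['novelChapter'])
    print(chap['novelChaptersContent'][:200])  # first 200 characters of the chapter body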

8. Run the spiders. From the project root, execute in a DOS/command window:
scrapy crawl <spider name>  (use the name = "..." defined in each spider file)
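
In this project the spiders feed each other through the Redis lists, so they are run in the order they were written above (the names come from each spider's name attribute):

scrapy crawl qidianClass4               # categories    -> novelurl
scrapy crawl qidianNovel                # novel lists   -> novelnameurl
scrapy crawl qidianNovelChapterInfo     # chapter lists -> novelChapterUrl
scrapy crawl qidianNovelWorksInfo       # work details (updates novelname)
scrapy crawl qidianNovelChapterContent  # chapter text (updates novelChapterInfo)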

I have been busy with another project recently and have not had time to tidy this up; apologies.
